midterm-02

library(tibble)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(haven)
library(ggplot2)
body.demo <- read_csv("/Users/yunis/Desktop/NHANES_v1(1).csv")

## Rows: 9756 Columns: 73

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (72): seqn, sddsrvyr, ridstatr, riagendr, ridageyr, ridagemn, ridreth1, ...
## lgl  (1): bmihead

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

names (body.demo)

##  [1] "seqn"     "sddsrvyr" "ridstatr" "riagendr" "ridageyr" "ridagemn"
##  [7] "ridreth1" "ridreth3" "ridexmon" "ridexagy" "ridexagm" "dmqmiliz"
## [13] "dmqadfc"  "dmdborn4" "dmdcitzn" "dmdyrsus" "dmdeduc3" "dmdeduc2"
## [19] "dmdmartl" "ridexprg" "sialang"  "siaproxy" "siaintrp" "fialang" 
## [25] "fiaproxy" "fiaintrp" "mialang"  "miaproxy" "miaintrp" "aialanga"
## [31] "wtint2yr" "wtmec2yr" "sdmvpsu"  "sdmvstra" "indhhin2" "indfmin2"
## [37] "indfmpir" "dmdhhsiz" "dmdfmsiz" "dmdhhsza" "dmdhhszb" "dmdhhsze"
## [43] "dmdhrgnd" "dmdhrage" "dmdhrbr4" "dmdhredu" "dmdhrmar" "dmdhsedu"
## [49] "bmdstats" "bmxwt"    "bmiwt"    "bmxrecum" "bmirecum" "bmxhead" 
## [55] "bmihead"  "bmxht"    "bmiht"    "bmxbmi"   "bmdbmic"  "bmxleg"  
## [61] "bmileg"   "bmxarml"  "bmiarml"  "bmxarmc"  "bmiarmc"  "bmxwaist"
## [67] "bmiwaist" "bmxsad1"  "bmxsad2"  "bmxsad3"  "bmxsad4"  "bmdavsad"
## [73] "bmdsadcm"

# Dependent Variable = bmxbmi 
# independent variables#1  Education level - Adults 20+ (DMDEDUC2) # 
# independent variables#2 Annual family income (INDFMIN2) # 

body.demo.new <- data.frame(body.demo$bmxbmi, body.demo$dmdeduc2, body.demo$indfmin2)
names (body.demo.new)

## [1] "body.demo.bmxbmi"   "body.demo.dmdeduc2" "body.demo.indfmin2"

dim (body.demo.new)

## [1] 9756    3

colnames(body.demo.new) <- c("bmi", "edu", "income")
body.demo.new <- na.omit(body.demo.new)

lm(bmi ~ edu, data=body.demo.new)

## 
## Call:
## lm(formula = bmi ~ edu, data = body.demo.new)
## 
## Coefficients:
## (Intercept)          edu  
##     30.3870      -0.4584

lm(bmi ~ income, data=body.demo.new)

## 
## Call:
## lm(formula = bmi ~ income, data = body.demo.new)
## 
## Coefficients:
## (Intercept)       income  
##    29.03509     -0.02216

# remove missing value # 
body.demo$dmdeduc2[body.demo$dmdeduc2==7] <- NA 
body.demo$dmdeduc2[body.demo$dmdeduc2==9] <- NA 
edu <- body.demo$dmdeduc2
edu <- na.omit(edu)
summary(edu)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   4.000   3.462   5.000   5.000

# remove missing value # 
body.demo$indfmin2[body.demo$indfmin2==77] <- NA 
body.demo$indfmin2[body.demo$indfmin2==99] <- NA 
income <- body.demo$indfmin2
income <- na.omit(income)
summary(income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   7.000   7.832  12.000  15.000

# Theorize why and how each of two the independent variables may affect the body mass index.
# in this exercise, I will use the education level for adults 20+ as independent variables and family income as independent variables; the dependent Variable will be BMI. In theory, education will affect the body mass index. This time, I want to see if a person's partner education and family income will have an effect on the body mass index #

# fitting the education level#
# Obtain and interpret the regression coefficient estimates # 

# the multiple R-squared is less than 1 at 0.00722, and it is less 1%, which means education level can't explain the bmi value # 
fit.edu <- lm(bmi ~ edu, data=body.demo.new)
summary(fit.edu)

## 
## Call:
## lm(formula = bmi ~ edu, data = body.demo.new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.629  -4.829  -1.195   3.347  53.088 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.38697    0.27595 110.116  < 2e-16 ***
## edu         -0.45845    0.07444  -6.158 7.91e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.866 on 5215 degrees of freedom
## Multiple R-squared:  0.00722,    Adjusted R-squared:  0.007029 
## F-statistic: 37.92 on 1 and 5215 DF,  p-value: 7.906e-10

nobs(fit.edu) # 5217

## [1] 5217

anova(fit.edu)

## Analysis of Variance Table
## 
## Response: bmi
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## edu          1   1788 1787.78  37.924 7.906e-10 ***
## Residuals 5215 245841   47.14                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# finding the coefficient for the education level and the body mass index# 
# for one level increase in education, a person's bmi will decrease by -0.45845 # 
coef(fit.edu)

## (Intercept)         edu 
##  30.3869697  -0.4584479

summary(fit.edu)

## 
## Call:
## lm(formula = bmi ~ edu, data = body.demo.new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.629  -4.829  -1.195   3.347  53.088 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.38697    0.27595 110.116  < 2e-16 ***
## edu         -0.45845    0.07444  -6.158 7.91e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.866 on 5215 degrees of freedom
## Multiple R-squared:  0.00722,    Adjusted R-squared:  0.007029 
## F-statistic: 37.92 on 1 and 5215 DF,  p-value: 7.906e-10

# the starting value is 1 （Less than 9th grade), and max is 5 (College graduate or above), both 7 (refuse) and 9 (don't know) are removed # 
confint(fit.edu, level=0.95)

##                  2.5 %     97.5 %
## (Intercept) 29.8459843 30.9279551
## edu         -0.6043904 -0.3125053

# predicted values, this is a vector of 5217 element # 
nobs(fit.edu)

## [1] 5217

bmi.edu <- fitted(fit.edu)
# this is a skewed distribution # 
hist(bmi.edu)

summary(bmi.edu)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.26   28.09   28.55   28.79   29.01   29.93

# in this predicted model adult for those with 
# adult with no education have a predicted bmi value of 30.38697, 
# adult with with less than 9th grade of education have a predicted bmi value of 29.92852, 
# adult with with 9-11th grade education have a predicted bmi value of 29.47007, 
# adult with with high school graduate/GED or equivalent have a predicted bmi value of 29.01163, 
# adult with with some college or AA degree have a predicted bmi value of 28.55318, 
# adult with with college graduate or above have a predicted bmi value of 28.09473 
bmi.edu.plot <- predict(fit.edu, data.frame(edu=seq(0,5)))
bmi.edu.plot

##        1        2        3        4        5        6 
## 30.38697 29.92852 29.47007 29.01163 28.55318 28.09473

# predicted values and the 95% confidence bands and the 95% prediction bands#
# confidence bands for the predicted mean values of dependent variable (bmi), at 95% confidence # 
predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="c")

##        fit      lwr      upr
## 1 30.38697 29.84598 30.92796
## 2 29.92852 29.52143 30.33561
## 3 29.47007 29.18480 29.75535
## 4 29.01163 28.81254 29.21071
## 5 28.55318 28.35196 28.75439
## 6 28.09473 27.80501 28.38445

# prediction bands for the predicted mean values of dependent variable (bmi), at 95% confidence # 
predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="p")

##        fit      lwr      upr
## 1 30.38697 16.91600 43.85794
## 2 29.92852 16.46226 43.39478
## 3 29.47007 16.00694 42.93320
## 4 29.01163 15.55005 42.47321
## 5 28.55318 15.09157 42.01479
## 6 28.09473 14.63151 41.55796

# Plot the confidence and prediction bands # 
ci.lwr <- predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="c")[,2]
ci.upr <- predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="c")[,3]
pi.lwr <- predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="p")[,2]
pi.upr <- predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="p")[,3]
edu.plot <- seq(0,5)
plot(edu.plot, bmi.edu.plot, ylim=range(pi.lwr,pi.upr),type="l", col="blue")
lines(edu.plot, ci.lwr, lty=6, col="pink")
lines(edu.plot, ci.upr, lty=6, col="pink")
lines(edu.plot, pi.lwr, lty=9, col="green")
lines(edu.plot, pi.upr, lty=9, col="green")

# fitting the annual family income #
# Obtain and interpret the regression coefficient estimates # 

# the multiple R-squared is less than 1 at  0.002629, and it is less 1%, which means annual family income can't explain the bmi value # 
fit.income <- lm(bmi ~ income, data=body.demo.new)
summary(fit.income)

## 
## Call:
## lm(formula = bmi ~ income, data = body.demo.new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.369  -4.803  -1.125   3.397  53.109 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.035091   0.115722 250.904  < 2e-16 ***
## income      -0.022155   0.005976  -3.708 0.000211 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.882 on 5215 degrees of freedom
## Multiple R-squared:  0.002629,   Adjusted R-squared:  0.002438 
## F-statistic: 13.75 on 1 and 5215 DF,  p-value: 0.0002115

nobs(fit.income) #5217

## [1] 5217

anova(fit.income)

## Analysis of Variance Table
## 
## Response: bmi
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## income       1    651  650.98  13.746 0.0002115 ***
## Residuals 5215 246977   47.36                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# finding the coefficient for the education level and the body mass index# 
# for one level increase in annual family income, bmi will decrease by 2% #
coef(fit.income)

## (Intercept)      income 
## 29.03509149 -0.02215547

summary(fit.income)

## 
## Call:
## lm(formula = bmi ~ income, data = body.demo.new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.369  -4.803  -1.125   3.397  53.109 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.035091   0.115722 250.904  < 2e-16 ***
## income      -0.022155   0.005976  -3.708 0.000211 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.882 on 5215 degrees of freedom
## Multiple R-squared:  0.002629,   Adjusted R-squared:  0.002438 
## F-statistic: 13.75 on 1 and 5215 DF,  p-value: 0.0002115

# the starting value is 1 （$ 0 to $ 4,999), and max is 15 ($100,000 and Over), both 71 (refuse) and 99 (don't know) are removed # 
confint(fit.income, level=0.95)

##                   2.5 %      97.5 %
## (Intercept) 28.80822778 29.26195520
## income      -0.03387062 -0.01044032

# predicted values, this is a vector of 5217 element # 
nobs(fit.income)

## [1] 5217

bmi.income <- fitted(fit.income)
# this is a skewed distribution # 
hist(bmi.income)

# in this predicted model adult for those with 
bmi.income.plot <- predict(fit.income, data.frame(income=seq(0,15)))
bmi.income.plot

##        1        2        3        4        5        6        7        8 
## 29.03509 29.01294 28.99078 28.96863 28.94647 28.92431 28.90216 28.88000 
##        9       10       11       12       13       14       15       16 
## 28.85785 28.83569 28.81354 28.79138 28.76923 28.74707 28.72491 28.70276

predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="c")

##         fit      lwr      upr
## 1  29.03509 28.80823 29.26196
## 2  29.01294 28.79251 29.23336
## 3  28.99078 28.77635 29.20522
## 4  28.96863 28.75970 29.17756
## 5  28.94647 28.74252 29.15042
## 6  28.92431 28.72478 29.12385
## 7  28.90216 28.70644 29.09788
## 8  28.88000 28.68746 29.07255
## 9  28.85785 28.66781 29.04789
## 10 28.83569 28.64746 29.02393
## 11 28.81354 28.62639 29.00068
## 12 28.79138 28.60460 28.97817
## 13 28.76923 28.58207 28.95638
## 14 28.74707 28.55881 28.93533
## 15 28.72491 28.53483 28.91500
## 16 28.70276 28.51016 28.89536

predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="p")

##         fit      lwr      upr
## 1  29.03509 15.54199 42.52819
## 2  29.01294 15.51994 42.50593
## 3  28.99078 15.49788 42.48368
## 4  28.96863 15.47582 42.46143
## 5  28.94647 15.45374 42.43920
## 6  28.92431 15.43165 42.41698
## 7  28.90216 15.40955 42.39477
## 8  28.88000 15.38744 42.37257
## 9  28.85785 15.36532 42.35038
## 10 28.83569 15.34319 42.32820
## 11 28.81354 15.32105 42.30603
## 12 28.79138 15.29890 42.28387
## 13 28.76923 15.27674 42.26172
## 14 28.74707 15.25457 42.23958
## 15 28.72491 15.23238 42.21745
## 16 28.70276 15.21019 42.19533

ci.lwr <- predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="c")[,2]
ci.upr <- predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="c")[,3]
pi.lwr <- predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="p")[,2]
pi.upr <- predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="p")[,3]
income.plot <- seq(0,15)
plot(income.plot, bmi.income.plot, ylim=range(pi.lwr,pi.upr),type="l", col="orange")
lines(income.plot, ci.lwr, lty=6, col="#009999")
lines(income.plot, ci.upr, lty=6, col="#009999")
lines(income.plot, pi.lwr, lty=9, col="grey")
lines(income.plot, pi.upr, lty=9, col="grey")

# 2 random sets # 
n.sub <- round(nrow(body.demo.new)/2)
set.seed(2021)
M <- 100
MSE <- vector() 
for(i in 1:M){
ix.train <- sample(1:nrow(body.demo.new), n.sub, replace=F) # indexing the random set # 
hrs.train <- body.demo.new[ix.train,] # setting sequence # 
  ix.test <- which(!(1:nrow(body.demo.new)%in%ix.train))
  hrs.test <- body.demo.new[ix.test,]
  head(hrs.test)
  fit <- lm(bmi ~ edu, data=hrs.train)
  summary(fit)
  yhat <- predict(fit, hrs.test)
  MSE[i] <- mean((hrs.test$bmi-yhat)^2)}
hist(MSE)

summary(MSE)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   43.76   45.70   46.97   46.93   48.10   49.91

MSE.02 <- vector() 
for(i in 1:M){
ix.train <- sample(1:nrow(body.demo.new), n.sub, replace=F) # indexing the random set # 
hrs.train <- body.demo.new[ix.train,] # setting sequence # 
  ix.test <- which(!(1:nrow(body.demo.new)%in%ix.train))
  hrs.test <- body.demo.new[ix.test,]
  head(hrs.test)
  fit.02 <- lm(bmi ~ income, data=hrs.train)
  summary(fit.02)
  yhat.02 <- predict(fit, hrs.test)
  MSE.02[i] <- mean((hrs.test$bmi-yhat.02)^2)}
hist(MSE.02)

summary(MSE.02)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   43.42   45.98   47.14   47.00   47.89   50.04