library(readr)
hrs=read_csv("/Users/juliushunte/desktop/HRS_r1bmi.csv")
## Parsed with column specification:
## cols(
##   hhidpn = col_integer(),
##   raedyrs = col_integer(),
##   raeduc = col_character(),
##   rameduc = col_integer(),
##   rafeduc = col_integer(),
##   ragender = col_character(),
##   raracem = col_character(),
##   r1agey_b = col_integer(),
##   r1agey_e = col_integer(),
##   r1agey_m = col_integer(),
##   r1bmi = col_double()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 24 parsing failures.
## row # A tibble: 5 x 5 col     row col     expected           actual file                             expected   <int> <chr>   <chr>              <chr>  <chr>                            actual 1 12553 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… file 2 12561 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… row 3 12561 rafeduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… col 4 12562 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… expected 5 12562 rafeduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
head(hrs)

HRS - Health Retirement Study

hrs.new = data.frame(hrs$r1bmi, hrs$raedyrs)
summary(hrs.new)
##    hrs.r1bmi      hrs.raedyrs   
##  Min.   : 12.8   Min.   : 0.00  
##  1st Qu.: 23.6   1st Qu.:11.00  
##  Median : 26.5   Median :12.00  
##  Mean   : 27.1   Mean   :12.02  
##  3rd Qu.: 29.6   3rd Qu.:14.00  
##  Max.   :102.7   Max.   :17.00  
##                  NA's   :1

RAEDYRS - Respondent’s Education in Years

hrs.new = na.omit(hrs.new)
summary(hrs.new)
##    hrs.r1bmi      hrs.raedyrs   
##  Min.   : 12.8   Min.   : 0.00  
##  1st Qu.: 23.6   1st Qu.:11.00  
##  Median : 26.5   Median :12.00  
##  Mean   : 27.1   Mean   :12.02  
##  3rd Qu.: 29.6   3rd Qu.:14.00  
##  Max.   :102.7   Max.   :17.00
colnames(hrs.new)=c("bmi","edyrs")
attach(hrs.new)
boxplot(bmi,ylab="BMI")

plot(edyrs,bmi)

plot(density(bmi),main = "Density Plot: BMI", xlab = "BMI")

qqnorm(bmi,main = "Population Distribution of BMI")
qqline(bmi)

boxplot(bmi,ylab="BMI")

plot(edyrs,bmi,xlab="years of schooling",ylab="BMI")

### Linear Model on the effect that Years of Education has on Body Mass Index of a population using the Health Retirement Study dataset. Predictor is the Years of Education, and the target response variable is the Body Mass Index. More generally, this seeks to measure the correlation between educational attainment and obesity.

fit=lm(bmi~edyrs)
summary(fit)
## 
## Call:
## lm(formula = bmi ~ edyrs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.791  -3.401  -0.701   2.517  76.253 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.06242    0.17460  166.45   <2e-16 ***
## edyrs       -0.16348    0.01402  -11.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.153 on 12649 degrees of freedom
## Multiple R-squared:  0.01064,    Adjusted R-squared:  0.01056 
## F-statistic:   136 on 1 and 12649 DF,  p-value: < 2.2e-16

16 refers to college, 12 refers to high school. negative association.

(16-12)*(-.16348)
## [1] -0.65392

Coefficients are two unknown constants that represent the intercept and slope terms in the linear model. The intercept is the expected BMI value when a certain number of years of education is obtained.

coefficients(fit)
## (Intercept)       edyrs 
##   29.062423   -0.163484

For every year of schooling obtained, the individual’s BMI can decrease by 0.163484. The Residual Standard Error which measures the variance that the BMI can vary from the true linear regression line, is 5.153. This would give a percentage error of 17.73%

confint(fit,level = 0.99)
##                  0.5 %     99.5 %
## (Intercept) 28.6126192 29.5122262
## edyrs       -0.1995982 -0.1273697

The F-statistic at 136 is significantly larger than 1, which indicates that there may be a relationship between Years of Schooling and BMI.

bmi.hat=fitted(fit)
bmi.resid=resid(fit)
summary(bmi.hat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.28   26.77   27.10   27.10   27.26   29.06
predict(fit,data.frame(edyrs=seq(0,17)))
##        1        2        3        4        5        6        7        8 
## 29.06242 28.89894 28.73545 28.57197 28.40849 28.24500 28.08152 27.91803 
##        9       10       11       12       13       14       15       16 
## 27.75455 27.59107 27.42758 27.26410 27.10062 26.93713 26.77365 26.61016 
##       17       18 
## 26.44668 26.28320
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
##         fit      lwr      upr
## 1  29.06242 28.72018 29.40466
## 2  28.89894 28.58313 29.21475
## 3  28.73545 28.44589 29.02502
## 4  28.57197 28.30839 28.83555
## 5  28.40849 28.17055 28.64642
## 6  28.24500 28.03226 28.45774
## 7  28.08152 27.89333 28.26971
## 8  27.91803 27.75347 28.08260
## 9  27.75455 27.61222 27.89688
## 10 27.59107 27.46882 27.71331
## 11 27.42758 27.32204 27.53313
## 12 27.26410 27.17004 27.35816
## 13 27.10062 27.01082 27.19041
## 14 26.93713 26.84337 27.03089
## 15 26.77365 26.66863 26.87866
## 16 26.61016 26.48861 26.73171
## 17 26.44668 26.30515 26.58821
## 18 26.28320 26.11949 26.44690
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
##         fit      lwr      upr
## 1  29.06242 18.95654 39.16831
## 2  28.89894 18.79392 39.00396
## 3  28.73545 18.63122 38.83969
## 4  28.57197 18.46845 38.67550
## 5  28.40849 18.30560 38.51138
## 6  28.24500 18.14268 38.34733
## 7  28.08152 17.97968 38.18336
## 8  27.91803 17.81661 38.01946
## 9  27.75455 17.65346 37.85564
## 10 27.59107 17.49024 37.69189
## 11 27.42758 17.32695 37.52822
## 12 27.26410 17.16357 37.36462
## 13 27.10062 17.00013 37.20110
## 14 26.93713 16.83661 37.03765
## 15 26.77365 16.67301 36.87428
## 16 26.61016 16.50935 36.71098
## 17 26.44668 16.34560 36.54776
## 18 26.28320 16.18178 36.38461
ci.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit)
lines(sort(edyrs),ci.lwr,lty=3,col="red")
lines(sort(edyrs),ci.upr,lty=3,col="red")
lines(sort(edyrs),pi.lwr,lty=2,col="blue")
lines(sort(edyrs),pi.upr,lty=2,col="blue")

###Prediction accuracy

cor(bmi,bmi.hat)
## [1] 0.1031405

Mean absolute percentage error

mean(abs((bmi - bmi.hat))/bmi)
## [1] 0.1432649

Assumption Linearity

plot(edyrs,bmi,xlab = "Years of Schooling",ylab = "BMI")
abline(fit,col="red")

###Assumption that mean of residuals is zero

mean(bmi.resid)
## [1] -2.980676e-17

Assumption Homoscedasticity of residuals

plot(bmi.hat,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")

###Assumption Normality of residuals

qqnorm(bmi.resid)
qqline(bmi.resid,col="red")

bmi.stdresid = rstandard(fit)
qqnorm(bmi.stdresid,ylab = "standardized residuals")
qqline(bmi.stdresid,col="red")

###Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected

cor.test(edyrs,bmi.resid)
## 
##  Pearson's product-moment correlation
## 
## data:  edyrs and bmi.resid
## t = 9.7423e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01742582  0.01742582
## sample estimates:
##         cor 
## 8.66226e-16
par(mfrow=c(2,2))
plot(fit)

bmi2 = bmi^2
mean(bmi2)
## [1] 761.1097
sd(bmi2)
## [1] 324.7898
boxplot(bmi2,ylab="BMI SQUARED")

plot(density(bmi2),main = "Density Plot: BMI SQUARED", xlab = "BMI")

qqnorm(bmi2,main = "Population Distribution of BMI SQUARED")
qqline(bmi2)

plot(edyrs,bmi2,xlab="years of schooling",ylab="BMI SQUARED")

fit4=lm(bmi2~edyrs)
summary(fit4)
## 
## Call:
## lm(formula = bmi2 ~ edyrs)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -626.7 -199.6  -64.3  116.4 9824.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 877.0285    10.9539   80.06   <2e-16 ***
## edyrs        -9.6449     0.8795  -10.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 323.3 on 12649 degrees of freedom
## Multiple R-squared:  0.009418,   Adjusted R-squared:  0.00934 
## F-statistic: 120.3 on 1 and 12649 DF,  p-value: < 2.2e-16

16 refers to college, 12 refers to high school. negative association.

(16-12)*(-9.6449)
## [1] -38.5796

Through the BMI squared model, for every year of schooling obtained, the individual’s BMI can decrease by 9.6449. The Residual Standard Error which measures the variance that the BMI can vary from the true linear regression line, is 323.3. This would give a percentage error of 36.86%

coefficients(fit4)
## (Intercept)       edyrs 
##  877.028550   -9.644913

The F-statistic at 120.3 is significantly larger than 1, which indicates that there may be a relationship between Years of Schooling and BMI.

confint(fit4,level = 0.99)
##                 0.5 %     99.5 %
## (Intercept) 848.80887 905.248226
## edyrs       -11.91064  -7.379187
bmi2.hat=fitted(fit4)
bmi2.resid4=resid(fit4)
summary(bmi2.hat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   713.1   742.0   761.3   761.1   770.9   877.0
predict(fit4,data.frame(edyrs=seq(0,17)))
##        1        2        3        4        5        6        7        8 
## 877.0285 867.3836 857.7387 848.0938 838.4489 828.8040 819.1591 809.5142 
##        9       10       11       12       13       14       15       16 
## 799.8692 790.2243 780.5794 770.9345 761.2896 751.6447 741.9998 732.3549 
##       17       18 
## 722.7099 713.0650
predict(fit4,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
##         fit      lwr      upr
## 1  877.0285 855.5572 898.4999
## 2  867.3836 847.5706 887.1966
## 3  857.7387 839.5719 875.9056
## 4  848.0938 831.5572 864.6304
## 5  838.4489 823.5216 853.3762
## 6  828.8040 815.4572 842.1508
## 7  819.1591 807.3526 830.9655
## 8  809.5142 799.1899 819.8384
## 9  799.8692 790.9399 808.7986
## 10 790.2243 782.5550 797.8937
## 11 780.5794 773.9576 787.2012
## 12 770.9345 765.0335 776.8355
## 13 761.2896 755.6558 766.9234
## 14 751.6447 745.7625 757.5269
## 15 741.9998 735.4115 748.5880
## 16 732.3549 724.7290 739.9807
## 17 722.7099 713.8304 731.5895
## 18 713.0650 702.7946 723.3355
predict(fit4,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
##         fit       lwr      upr
## 1  877.0285 243.00774 1511.049
## 2  867.3836 233.41683 1501.350
## 3  857.7387 223.82122 1491.656
## 4  848.0938 214.22094 1481.967
## 5  838.4489 204.61596 1472.282
## 6  828.8040 195.00631 1462.602
## 7  819.1591 185.39196 1452.926
## 8  809.5142 175.77293 1443.255
## 9  799.8692 166.14920 1433.589
## 10 790.2243 156.52079 1423.928
## 11 780.5794 146.88769 1414.271
## 12 770.9345 137.24990 1404.619
## 13 761.2896 127.60742 1394.972
## 14 751.6447 117.96025 1385.329
## 15 741.9998 108.30839 1375.691
## 16 732.3549  98.65184 1366.058
## 17 722.7099  88.99060 1356.429
## 18 713.0650  79.32467 1346.805
ci.lwr=predict(fit4,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit4,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit4,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit4,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit4)
lines(sort(edyrs),ci.lwr,lty=3,col="orange")
lines(sort(edyrs),ci.upr,lty=3,col="orange")
lines(sort(edyrs),pi.lwr,lty=2,col="purple")
lines(sort(edyrs),pi.upr,lty=2,col="purple")

###Prediction accuracy

cor(bmi,bmi2.hat)
## [1] 0.1031405

Mean absolute percentage error

mean(abs((bmi - bmi2.hat))/bmi)
## [1] 28.00212

Assumption Linearity

plot(edyrs,bmi2,xlab = "Years of Schooling",ylab = "BMI SQRT")
abline(fit4,col="purple")

###Assumption that mean of residuals is zero

plot(bmi2.hat,bmi2.resid4,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")

###Assumption Normality of residuals

qqnorm(bmi2.resid4)
qqline(bmi2.resid4,col="red")

bmi2.stdresid4 = rstandard(fit)
qqnorm(bmi2.stdresid4,ylab = "standardized residuals")
qqline(bmi2.stdresid4,col="red")

###Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected

cor.test(edyrs,bmi2.resid4)
## 
##  Pearson's product-moment correlation
## 
## data:  edyrs and bmi2.resid4
## t = 5.9367e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01742582  0.01742582
## sample estimates:
##          cor 
## 5.278597e-16

Results indicate a negative correlation between Years of schooling and BMI where an increase in Years of Schooling reveals a decrease in BMI, and a lower number of years of schooling indicate a higher BMI.

par(mfrow=c(2,2))
plot(fit4)