library(readr)
hrs=read_csv("/Users/juliushunte/desktop/HRS_r1bmi.csv")
## Parsed with column specification:
## cols(
##   hhidpn = col_integer(),
##   raedyrs = col_integer(),
##   raeduc = col_character(),
##   rameduc = col_integer(),
##   rafeduc = col_integer(),
##   ragender = col_character(),
##   raracem = col_character(),
##   r1agey_b = col_integer(),
##   r1agey_e = col_integer(),
##   r1agey_m = col_integer(),
##   r1bmi = col_double()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 24 parsing failures.
## row # A tibble: 5 x 5 col     row col     expected           actual file                             expected   <int> <chr>   <chr>              <chr>  <chr>                            actual 1 12553 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… file 2 12561 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… row 3 12561 rafeduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… col 4 12562 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… expected 5 12562 rafeduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
head(hrs)

HRS - Health Retirement Study

hrs.new = data.frame(hrs$r1bmi, hrs$raedyrs)
summary(hrs.new)
##    hrs.r1bmi      hrs.raedyrs   
##  Min.   : 12.8   Min.   : 0.00  
##  1st Qu.: 23.6   1st Qu.:11.00  
##  Median : 26.5   Median :12.00  
##  Mean   : 27.1   Mean   :12.02  
##  3rd Qu.: 29.6   3rd Qu.:14.00  
##  Max.   :102.7   Max.   :17.00  
##                  NA's   :1
hrs.new = na.omit(hrs.new)
summary(hrs.new)
##    hrs.r1bmi      hrs.raedyrs   
##  Min.   : 12.8   Min.   : 0.00  
##  1st Qu.: 23.6   1st Qu.:11.00  
##  Median : 26.5   Median :12.00  
##  Mean   : 27.1   Mean   :12.02  
##  3rd Qu.: 29.6   3rd Qu.:14.00  
##  Max.   :102.7   Max.   :17.00

RAEDYRS - Respondent’s Education in Years

colnames(hrs.new)=c("bmi","edyrs")
attach(hrs.new)
boxplot(bmi,ylab="BMI")

plot(edyrs,bmi)

plot(density(bmi),main = "Density Plot: BMI", xlab = "BMI")

qqnorm(bmi,main = "Population Distribution of BMI")
qqline(bmi)

boxplot(bmi,ylab="BMI")

plot(edyrs,bmi,xlab="years of schooling",ylab="BMI")

### Linear Model on the effect that Years of Education has on Body Mass Index of a population using the Health Retirement Study dataset. Predictor is the Years of Education, and the target response variable is the Body Mass Index. More generally, this seeks to measure the correlation between educational attainment and obesity.

fit=lm(bmi~edyrs)
summary(fit)
## 
## Call:
## lm(formula = bmi ~ edyrs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.791  -3.401  -0.701   2.517  76.253 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.06242    0.17460  166.45   <2e-16 ***
## edyrs       -0.16348    0.01402  -11.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.153 on 12649 degrees of freedom
## Multiple R-squared:  0.01064,    Adjusted R-squared:  0.01056 
## F-statistic:   136 on 1 and 12649 DF,  p-value: < 2.2e-16

16 refers to college, 12 refers to high school. negative association.

(16-12)*(-.16348)
## [1] -0.65392

Coefficients are two unknown constants that represent the intercept and slope terms in the linear model. The intercept is the expected BMI value when a certain number of years of education is obtained.

coefficients(fit)
## (Intercept)       edyrs 
##   29.062423   -0.163484

For every year of schooling obtained, the individual’s BMI can decrease by 0.163484. The Residual Standard Error which measures the variance that the BMI can vary from the true linear regression line, is 5.153. This would give a percentage error of 17.73%

confint(fit,level = 0.99)
##                  0.5 %     99.5 %
## (Intercept) 28.6126192 29.5122262
## edyrs       -0.1995982 -0.1273697

The F-statistic at 136 is significantly larger than 1, which indicates that there may be a relationship between Years of Schooling and BMI.

bmi.hat=fitted(fit)
bmi.resid=resid(fit)
summary(bmi.hat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.28   26.77   27.10   27.10   27.26   29.06
predict(fit,data.frame(edyrs=seq(0,17)))
##        1        2        3        4        5        6        7        8 
## 29.06242 28.89894 28.73545 28.57197 28.40849 28.24500 28.08152 27.91803 
##        9       10       11       12       13       14       15       16 
## 27.75455 27.59107 27.42758 27.26410 27.10062 26.93713 26.77365 26.61016 
##       17       18 
## 26.44668 26.28320
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
##         fit      lwr      upr
## 1  29.06242 28.72018 29.40466
## 2  28.89894 28.58313 29.21475
## 3  28.73545 28.44589 29.02502
## 4  28.57197 28.30839 28.83555
## 5  28.40849 28.17055 28.64642
## 6  28.24500 28.03226 28.45774
## 7  28.08152 27.89333 28.26971
## 8  27.91803 27.75347 28.08260
## 9  27.75455 27.61222 27.89688
## 10 27.59107 27.46882 27.71331
## 11 27.42758 27.32204 27.53313
## 12 27.26410 27.17004 27.35816
## 13 27.10062 27.01082 27.19041
## 14 26.93713 26.84337 27.03089
## 15 26.77365 26.66863 26.87866
## 16 26.61016 26.48861 26.73171
## 17 26.44668 26.30515 26.58821
## 18 26.28320 26.11949 26.44690
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
##         fit      lwr      upr
## 1  29.06242 18.95654 39.16831
## 2  28.89894 18.79392 39.00396
## 3  28.73545 18.63122 38.83969
## 4  28.57197 18.46845 38.67550
## 5  28.40849 18.30560 38.51138
## 6  28.24500 18.14268 38.34733
## 7  28.08152 17.97968 38.18336
## 8  27.91803 17.81661 38.01946
## 9  27.75455 17.65346 37.85564
## 10 27.59107 17.49024 37.69189
## 11 27.42758 17.32695 37.52822
## 12 27.26410 17.16357 37.36462
## 13 27.10062 17.00013 37.20110
## 14 26.93713 16.83661 37.03765
## 15 26.77365 16.67301 36.87428
## 16 26.61016 16.50935 36.71098
## 17 26.44668 16.34560 36.54776
## 18 26.28320 16.18178 36.38461
ci.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit)
lines(sort(edyrs),ci.lwr,lty=3,col="red")
lines(sort(edyrs),ci.upr,lty=3,col="red")
lines(sort(edyrs),pi.lwr,lty=2,col="blue")
lines(sort(edyrs),pi.upr,lty=2,col="blue")

###Prediction accuracy

cor(bmi,bmi.hat)
## [1] 0.1031405

Mean absolute percentage error

mean(abs((bmi - bmi.hat))/bmi)
## [1] 0.1432649

Assumption Linearity

plot(edyrs,bmi,xlab = "Years of Schooling",ylab = "BMI")
abline(fit,col="red")

###Assumption that mean of residuals is zero

mean(bmi.resid)
## [1] -2.980676e-17

Assumption Homoscedasticity of residuals

plot(bmi.hat,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")

###Assumption Normality of residuals

qqnorm(bmi.resid)
qqline(bmi.resid,col="red")

bmi.stdresid = rstandard(fit)
qqnorm(bmi.stdresid,ylab = "standardized residuals")
qqline(bmi.stdresid,col="red")

###Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected

cor.test(edyrs,bmi.resid)
## 
##  Pearson's product-moment correlation
## 
## data:  edyrs and bmi.resid
## t = 9.7423e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01742582  0.01742582
## sample estimates:
##         cor 
## 8.66226e-16
par(mfrow=c(2,2))
plot(fit)

bmi.sqrt = sqrt(bmi)
mean(bmi.sqrt)
## [1] 5.183408
sd(bmi.sqrt)
## [1] 0.4794398
boxplot(bmi.sqrt,ylab="BMI SQRT")

plot(density(bmi.sqrt),main = "Density Plot: BMI SQRT", xlab = "BMI")

qqnorm(bmi.sqrt,main = "Population Distribution of BMI SQRT")
qqline(bmi.sqrt)

plot(edyrs,bmi.sqrt,xlab="years of schooling",ylab="BMI SQRT")

fit3=lm(bmi.sqrt~edyrs)
summary(fit3)
## 
## Call:
## lm(formula = bmi.sqrt ~ edyrs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6515 -0.3157 -0.0456  0.2587  5.0111 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.365757   0.016159  332.06   <2e-16 ***
## edyrs       -0.015172   0.001297  -11.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4769 on 12649 degrees of freedom
## Multiple R-squared:  0.0107, Adjusted R-squared:  0.01062 
## F-statistic: 136.8 on 1 and 12649 DF,  p-value: < 2.2e-16

16 refers to college, 12 refers to high school. negative association.

(16-12)*(-0.015172)
## [1] -0.060688

Through the BMI square root model, for every year of schooling obtained, the individual’s BMI can decrease by 0.015172. The Residual Standard Error which measures the variance that the BMI can vary from the true linear regression line, is 0.4769. This would give a percentage error of 8.88%.

coefficients(fit3)
## (Intercept)       edyrs 
##  5.36575684 -0.01517213

The F-statistic at 136.8 is significantly larger than 1, which indicates that there may be a relationship between Years of Schooling and BMI.

confint(fit3,level = 0.99)
##                   0.5 %      99.5 %
## (Intercept)  5.32412712  5.40738656
## edyrs       -0.01851453 -0.01182973
bmi.sqrt.hat=fitted(fit3)
bmi.sqrt.resid3=resid(fit3)
summary(bmi.sqrt.hat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.108   5.153   5.184   5.183   5.199   5.366
predict(fit3,data.frame(edyrs=seq(0,17)))
##        1        2        3        4        5        6        7        8 
## 5.365757 5.350585 5.335413 5.320240 5.305068 5.289896 5.274724 5.259552 
##        9       10       11       12       13       14       15       16 
## 5.244380 5.229208 5.214036 5.198863 5.183691 5.168519 5.153347 5.138175 
##       17       18 
## 5.123003 5.107831
predict(fit3,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
##         fit      lwr      upr
## 1  5.365757 5.334082 5.397431
## 2  5.350585 5.321357 5.379813
## 3  5.335413 5.308613 5.362212
## 4  5.320240 5.295846 5.344635
## 5  5.305068 5.283047 5.327089
## 6  5.289896 5.270207 5.309585
## 7  5.274724 5.257307 5.292141
## 8  5.259552 5.244322 5.274782
## 9  5.244380 5.231207 5.257552
## 10 5.229208 5.217894 5.240521
## 11 5.214036 5.204267 5.223804
## 12 5.198863 5.190158 5.207569
## 13 5.183691 5.175380 5.192002
## 14 5.168519 5.159842 5.177197
## 15 5.153347 5.143628 5.163066
## 16 5.138175 5.126925 5.149425
## 17 5.123003 5.109904 5.136102
## 18 5.107831 5.092680 5.122982
predict(fit3,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
##         fit      lwr      upr
## 1  5.365757 4.430448 6.301066
## 2  5.350585 4.415356 6.285814
## 3  5.335413 4.400256 6.270569
## 4  5.320240 4.385150 6.255331
## 5  5.305068 4.370037 6.240100
## 6  5.289896 4.354917 6.224876
## 7  5.274724 4.339790 6.209658
## 8  5.259552 4.324656 6.194448
## 9  5.244380 4.309515 6.179245
## 10 5.229208 4.294367 6.164048
## 11 5.214036 4.279212 6.148859
## 12 5.198863 4.264051 6.133676
## 13 5.183691 4.248882 6.118500
## 14 5.168519 4.233707 6.103332
## 15 5.153347 4.218524 6.088170
## 16 5.138175 4.203335 6.073015
## 17 5.123003 4.188139 6.057867
## 18 5.107831 4.172936 6.042726
ci.lwr=predict(fit3,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit3,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit3,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit3,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit3)
lines(sort(edyrs),ci.lwr,lty=3,col="orange")
lines(sort(edyrs),ci.upr,lty=3,col="orange")
lines(sort(edyrs),pi.lwr,lty=2,col="purple")
lines(sort(edyrs),pi.upr,lty=2,col="purple")

###Prediction accuracy

cor(bmi,bmi.sqrt.hat)
## [1] 0.1031405

Mean absolute percentage error

mean(abs((bmi - bmi.sqrt.hat))/bmi)
## [1] 0.8023777

Assumption Linearity

plot(edyrs,bmi.sqrt,xlab = "Years of Schooling",ylab = "BMI SQRT")
abline(fit3,col="purple")

###Assumption that mean of residuals is zero

plot(bmi.sqrt.hat,bmi.sqrt.resid3,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")

###Assumption Normality of residuals

qqnorm(bmi.sqrt.resid3)
qqline(bmi.sqrt.resid3,col="red")

bmi.sqrt.stdresid3 = rstandard(fit)
qqnorm(bmi.sqrt.stdresid3,ylab = "standardized residuals")
qqline(bmi.sqrt.stdresid3,col="red")

###Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected

cor.test(edyrs,bmi.sqrt.resid3)
## 
##  Pearson's product-moment correlation
## 
## data:  edyrs and bmi.sqrt.resid3
## t = 9.8536e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01742582  0.01742582
## sample estimates:
##          cor 
## 8.761228e-16

Results indicate a negative correlation between Years of schooling and BMI where an increase in Years of Schooling reveals a decrease in BMI, and a lower number of years of schooling indicate a higher BMI.

par(mfrow=c(2,2))
plot(fit3)