library(readr)
hrs=read_csv("/Users/juliushunte/desktop/HRS_r1bmi.csv")
## Parsed with column specification:
## cols(
## hhidpn = col_integer(),
## raedyrs = col_integer(),
## raeduc = col_character(),
## rameduc = col_integer(),
## rafeduc = col_integer(),
## ragender = col_character(),
## raracem = col_character(),
## r1agey_b = col_integer(),
## r1agey_e = col_integer(),
## r1agey_m = col_integer(),
## r1bmi = col_double()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 24 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 12553 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… file 2 12561 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… row 3 12561 rafeduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… col 4 12562 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… expected 5 12562 rafeduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
head(hrs)
hrs.new = data.frame(hrs$r1bmi, hrs$raedyrs)
summary(hrs.new)
## hrs.r1bmi hrs.raedyrs
## Min. : 12.8 Min. : 0.00
## 1st Qu.: 23.6 1st Qu.:11.00
## Median : 26.5 Median :12.00
## Mean : 27.1 Mean :12.02
## 3rd Qu.: 29.6 3rd Qu.:14.00
## Max. :102.7 Max. :17.00
## NA's :1
hrs.new = na.omit(hrs.new)
summary(hrs.new)
## hrs.r1bmi hrs.raedyrs
## Min. : 12.8 Min. : 0.00
## 1st Qu.: 23.6 1st Qu.:11.00
## Median : 26.5 Median :12.00
## Mean : 27.1 Mean :12.02
## 3rd Qu.: 29.6 3rd Qu.:14.00
## Max. :102.7 Max. :17.00
colnames(hrs.new)=c("bmi","edyrs")
attach(hrs.new)
boxplot(bmi,ylab="BMI")
plot(edyrs,bmi)
plot(density(bmi),main = "Density Plot: BMI", xlab = "BMI")
qqnorm(bmi,main = "Population Distribution of BMI")
qqline(bmi)
boxplot(bmi,ylab="BMI")
plot(edyrs,bmi,xlab="years of schooling",ylab="BMI")
### Linear Model on the effect that Years of Education has on Body Mass Index of a population using the Health Retirement Study dataset. Predictor is the Years of Education, and the target response variable is the Body Mass Index. More generally, this seeks to measure the correlation between educational attainment and obesity.
fit=lm(bmi~edyrs)
summary(fit)
##
## Call:
## lm(formula = bmi ~ edyrs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.791 -3.401 -0.701 2.517 76.253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.06242 0.17460 166.45 <2e-16 ***
## edyrs -0.16348 0.01402 -11.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.153 on 12649 degrees of freedom
## Multiple R-squared: 0.01064, Adjusted R-squared: 0.01056
## F-statistic: 136 on 1 and 12649 DF, p-value: < 2.2e-16
(16-12)*(-.16348)
## [1] -0.65392
coefficients(fit)
## (Intercept) edyrs
## 29.062423 -0.163484
confint(fit,level = 0.99)
## 0.5 % 99.5 %
## (Intercept) 28.6126192 29.5122262
## edyrs -0.1995982 -0.1273697
bmi.hat=fitted(fit)
bmi.resid=resid(fit)
summary(bmi.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.28 26.77 27.10 27.10 27.26 29.06
predict(fit,data.frame(edyrs=seq(0,17)))
## 1 2 3 4 5 6 7 8
## 29.06242 28.89894 28.73545 28.57197 28.40849 28.24500 28.08152 27.91803
## 9 10 11 12 13 14 15 16
## 27.75455 27.59107 27.42758 27.26410 27.10062 26.93713 26.77365 26.61016
## 17 18
## 26.44668 26.28320
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
## fit lwr upr
## 1 29.06242 28.72018 29.40466
## 2 28.89894 28.58313 29.21475
## 3 28.73545 28.44589 29.02502
## 4 28.57197 28.30839 28.83555
## 5 28.40849 28.17055 28.64642
## 6 28.24500 28.03226 28.45774
## 7 28.08152 27.89333 28.26971
## 8 27.91803 27.75347 28.08260
## 9 27.75455 27.61222 27.89688
## 10 27.59107 27.46882 27.71331
## 11 27.42758 27.32204 27.53313
## 12 27.26410 27.17004 27.35816
## 13 27.10062 27.01082 27.19041
## 14 26.93713 26.84337 27.03089
## 15 26.77365 26.66863 26.87866
## 16 26.61016 26.48861 26.73171
## 17 26.44668 26.30515 26.58821
## 18 26.28320 26.11949 26.44690
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
## fit lwr upr
## 1 29.06242 18.95654 39.16831
## 2 28.89894 18.79392 39.00396
## 3 28.73545 18.63122 38.83969
## 4 28.57197 18.46845 38.67550
## 5 28.40849 18.30560 38.51138
## 6 28.24500 18.14268 38.34733
## 7 28.08152 17.97968 38.18336
## 8 27.91803 17.81661 38.01946
## 9 27.75455 17.65346 37.85564
## 10 27.59107 17.49024 37.69189
## 11 27.42758 17.32695 37.52822
## 12 27.26410 17.16357 37.36462
## 13 27.10062 17.00013 37.20110
## 14 26.93713 16.83661 37.03765
## 15 26.77365 16.67301 36.87428
## 16 26.61016 16.50935 36.71098
## 17 26.44668 16.34560 36.54776
## 18 26.28320 16.18178 36.38461
ci.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit)
lines(sort(edyrs),ci.lwr,lty=3,col="red")
lines(sort(edyrs),ci.upr,lty=3,col="red")
lines(sort(edyrs),pi.lwr,lty=2,col="blue")
lines(sort(edyrs),pi.upr,lty=2,col="blue")
###Prediction accuracy
cor(bmi,bmi.hat)
## [1] 0.1031405
mean(abs((bmi - bmi.hat))/bmi)
## [1] 0.1432649
plot(edyrs,bmi,xlab = "Years of Schooling",ylab = "BMI")
abline(fit,col="red")
###Assumption that mean of residuals is zero
mean(bmi.resid)
## [1] -2.980676e-17
plot(bmi.hat,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")
###Assumption Normality of residuals
qqnorm(bmi.resid)
qqline(bmi.resid,col="red")
bmi.stdresid = rstandard(fit)
qqnorm(bmi.stdresid,ylab = "standardized residuals")
qqline(bmi.stdresid,col="red")
###Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected
cor.test(edyrs,bmi.resid)
##
## Pearson's product-moment correlation
##
## data: edyrs and bmi.resid
## t = 9.7423e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01742582 0.01742582
## sample estimates:
## cor
## 8.66226e-16
par(mfrow=c(2,2))
plot(fit)
bmi2 = bmi^2
mean(bmi2)
## [1] 761.1097
sd(bmi2)
## [1] 324.7898
boxplot(bmi2,ylab="BMI SQUARED")
plot(density(bmi2),main = "Density Plot: BMI SQUARED", xlab = "BMI")
qqnorm(bmi2,main = "Population Distribution of BMI SQUARED")
qqline(bmi2)
plot(edyrs,bmi2,xlab="years of schooling",ylab="BMI SQUARED")
fit4=lm(bmi2~edyrs)
summary(fit4)
##
## Call:
## lm(formula = bmi2 ~ edyrs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -626.7 -199.6 -64.3 116.4 9824.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 877.0285 10.9539 80.06 <2e-16 ***
## edyrs -9.6449 0.8795 -10.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 323.3 on 12649 degrees of freedom
## Multiple R-squared: 0.009418, Adjusted R-squared: 0.00934
## F-statistic: 120.3 on 1 and 12649 DF, p-value: < 2.2e-16
(16-12)*(-9.6449)
## [1] -38.5796
coefficients(fit4)
## (Intercept) edyrs
## 877.028550 -9.644913
confint(fit4,level = 0.99)
## 0.5 % 99.5 %
## (Intercept) 848.80887 905.248226
## edyrs -11.91064 -7.379187
bmi2.hat=fitted(fit4)
bmi2.resid4=resid(fit4)
summary(bmi2.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 713.1 742.0 761.3 761.1 770.9 877.0
predict(fit4,data.frame(edyrs=seq(0,17)))
## 1 2 3 4 5 6 7 8
## 877.0285 867.3836 857.7387 848.0938 838.4489 828.8040 819.1591 809.5142
## 9 10 11 12 13 14 15 16
## 799.8692 790.2243 780.5794 770.9345 761.2896 751.6447 741.9998 732.3549
## 17 18
## 722.7099 713.0650
predict(fit4,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
## fit lwr upr
## 1 877.0285 855.5572 898.4999
## 2 867.3836 847.5706 887.1966
## 3 857.7387 839.5719 875.9056
## 4 848.0938 831.5572 864.6304
## 5 838.4489 823.5216 853.3762
## 6 828.8040 815.4572 842.1508
## 7 819.1591 807.3526 830.9655
## 8 809.5142 799.1899 819.8384
## 9 799.8692 790.9399 808.7986
## 10 790.2243 782.5550 797.8937
## 11 780.5794 773.9576 787.2012
## 12 770.9345 765.0335 776.8355
## 13 761.2896 755.6558 766.9234
## 14 751.6447 745.7625 757.5269
## 15 741.9998 735.4115 748.5880
## 16 732.3549 724.7290 739.9807
## 17 722.7099 713.8304 731.5895
## 18 713.0650 702.7946 723.3355
predict(fit4,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
## fit lwr upr
## 1 877.0285 243.00774 1511.049
## 2 867.3836 233.41683 1501.350
## 3 857.7387 223.82122 1491.656
## 4 848.0938 214.22094 1481.967
## 5 838.4489 204.61596 1472.282
## 6 828.8040 195.00631 1462.602
## 7 819.1591 185.39196 1452.926
## 8 809.5142 175.77293 1443.255
## 9 799.8692 166.14920 1433.589
## 10 790.2243 156.52079 1423.928
## 11 780.5794 146.88769 1414.271
## 12 770.9345 137.24990 1404.619
## 13 761.2896 127.60742 1394.972
## 14 751.6447 117.96025 1385.329
## 15 741.9998 108.30839 1375.691
## 16 732.3549 98.65184 1366.058
## 17 722.7099 88.99060 1356.429
## 18 713.0650 79.32467 1346.805
ci.lwr=predict(fit4,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit4,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit4,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit4,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit4)
lines(sort(edyrs),ci.lwr,lty=3,col="orange")
lines(sort(edyrs),ci.upr,lty=3,col="orange")
lines(sort(edyrs),pi.lwr,lty=2,col="purple")
lines(sort(edyrs),pi.upr,lty=2,col="purple")
###Prediction accuracy
cor(bmi,bmi2.hat)
## [1] 0.1031405
mean(abs((bmi - bmi2.hat))/bmi)
## [1] 28.00212
plot(edyrs,bmi2,xlab = "Years of Schooling",ylab = "BMI SQRT")
abline(fit4,col="purple")
###Assumption that mean of residuals is zero
plot(bmi2.hat,bmi2.resid4,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")
###Assumption Normality of residuals
qqnorm(bmi2.resid4)
qqline(bmi2.resid4,col="red")
bmi2.stdresid4 = rstandard(fit)
qqnorm(bmi2.stdresid4,ylab = "standardized residuals")
qqline(bmi2.stdresid4,col="red")
###Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected
cor.test(edyrs,bmi2.resid4)
##
## Pearson's product-moment correlation
##
## data: edyrs and bmi2.resid4
## t = 5.9367e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01742582 0.01742582
## sample estimates:
## cor
## 5.278597e-16
par(mfrow=c(2,2))
plot(fit4)