library(readr)
hrs=read_csv("/Users/juliushunte/desktop/HRS_r1bmi.csv")
## Parsed with column specification:
## cols(
## hhidpn = col_integer(),
## raedyrs = col_integer(),
## raeduc = col_character(),
## rameduc = col_integer(),
## rafeduc = col_integer(),
## ragender = col_character(),
## raracem = col_character(),
## r1agey_b = col_integer(),
## r1agey_e = col_integer(),
## r1agey_m = col_integer(),
## r1bmi = col_double()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 24 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 12553 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… file 2 12561 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… row 3 12561 rafeduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… col 4 12562 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… expected 5 12562 rafeduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
head(hrs)
hrs.new = data.frame(hrs$r1bmi, hrs$raedyrs)
summary(hrs.new)
## hrs.r1bmi hrs.raedyrs
## Min. : 12.8 Min. : 0.00
## 1st Qu.: 23.6 1st Qu.:11.00
## Median : 26.5 Median :12.00
## Mean : 27.1 Mean :12.02
## 3rd Qu.: 29.6 3rd Qu.:14.00
## Max. :102.7 Max. :17.00
## NA's :1
hrs.new = na.omit(hrs.new)
summary(hrs.new)
## hrs.r1bmi hrs.raedyrs
## Min. : 12.8 Min. : 0.00
## 1st Qu.: 23.6 1st Qu.:11.00
## Median : 26.5 Median :12.00
## Mean : 27.1 Mean :12.02
## 3rd Qu.: 29.6 3rd Qu.:14.00
## Max. :102.7 Max. :17.00
colnames(hrs.new)=c("bmi","edyrs")
attach(hrs.new)
boxplot(bmi,ylab="BMI")
plot(edyrs,bmi)
plot(density(bmi),main = "Density Plot: BMI", xlab = "BMI")
qqnorm(bmi,main = "Population Distribution of BMI")
qqline(bmi)
boxplot(bmi,ylab="BMI")
plot(edyrs,bmi,xlab="years of schooling",ylab="BMI")
### Linear Model on the effect that Years of Education has on Body Mass Index of a population using the Health Retirement Study dataset. Predictor is the Years of Education, and the target response variable is the Body Mass Index. More generally, this seeks to measure the correlation between educational attainment and obesity.
fit=lm(bmi~edyrs)
summary(fit)
##
## Call:
## lm(formula = bmi ~ edyrs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.791 -3.401 -0.701 2.517 76.253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.06242 0.17460 166.45 <2e-16 ***
## edyrs -0.16348 0.01402 -11.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.153 on 12649 degrees of freedom
## Multiple R-squared: 0.01064, Adjusted R-squared: 0.01056
## F-statistic: 136 on 1 and 12649 DF, p-value: < 2.2e-16
(16-12)*(-.16348)
## [1] -0.65392
coefficients(fit)
## (Intercept) edyrs
## 29.062423 -0.163484
confint(fit,level = 0.99)
## 0.5 % 99.5 %
## (Intercept) 28.6126192 29.5122262
## edyrs -0.1995982 -0.1273697
bmi.hat=fitted(fit)
bmi.resid=resid(fit)
summary(bmi.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.28 26.77 27.10 27.10 27.26 29.06
predict(fit,data.frame(edyrs=seq(0,17)))
## 1 2 3 4 5 6 7 8
## 29.06242 28.89894 28.73545 28.57197 28.40849 28.24500 28.08152 27.91803
## 9 10 11 12 13 14 15 16
## 27.75455 27.59107 27.42758 27.26410 27.10062 26.93713 26.77365 26.61016
## 17 18
## 26.44668 26.28320
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
## fit lwr upr
## 1 29.06242 28.72018 29.40466
## 2 28.89894 28.58313 29.21475
## 3 28.73545 28.44589 29.02502
## 4 28.57197 28.30839 28.83555
## 5 28.40849 28.17055 28.64642
## 6 28.24500 28.03226 28.45774
## 7 28.08152 27.89333 28.26971
## 8 27.91803 27.75347 28.08260
## 9 27.75455 27.61222 27.89688
## 10 27.59107 27.46882 27.71331
## 11 27.42758 27.32204 27.53313
## 12 27.26410 27.17004 27.35816
## 13 27.10062 27.01082 27.19041
## 14 26.93713 26.84337 27.03089
## 15 26.77365 26.66863 26.87866
## 16 26.61016 26.48861 26.73171
## 17 26.44668 26.30515 26.58821
## 18 26.28320 26.11949 26.44690
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
## fit lwr upr
## 1 29.06242 18.95654 39.16831
## 2 28.89894 18.79392 39.00396
## 3 28.73545 18.63122 38.83969
## 4 28.57197 18.46845 38.67550
## 5 28.40849 18.30560 38.51138
## 6 28.24500 18.14268 38.34733
## 7 28.08152 17.97968 38.18336
## 8 27.91803 17.81661 38.01946
## 9 27.75455 17.65346 37.85564
## 10 27.59107 17.49024 37.69189
## 11 27.42758 17.32695 37.52822
## 12 27.26410 17.16357 37.36462
## 13 27.10062 17.00013 37.20110
## 14 26.93713 16.83661 37.03765
## 15 26.77365 16.67301 36.87428
## 16 26.61016 16.50935 36.71098
## 17 26.44668 16.34560 36.54776
## 18 26.28320 16.18178 36.38461
ci.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit)
lines(sort(edyrs),ci.lwr,lty=3,col="red")
lines(sort(edyrs),ci.upr,lty=3,col="red")
lines(sort(edyrs),pi.lwr,lty=2,col="blue")
lines(sort(edyrs),pi.upr,lty=2,col="blue")
###Prediction accuracy
cor(bmi,bmi.hat)
## [1] 0.1031405
mean(abs((bmi - bmi.hat))/bmi)
## [1] 0.1432649
plot(edyrs,bmi,xlab = "Years of Schooling",ylab = "BMI")
abline(fit,col="red")
###Assumption that mean of residuals is zero
mean(bmi.resid)
## [1] -2.980676e-17
plot(bmi.hat,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")
###Assumption Normality of residuals
qqnorm(bmi.resid)
qqline(bmi.resid,col="red")
bmi.stdresid = rstandard(fit)
qqnorm(bmi.stdresid,ylab = "standardized residuals")
qqline(bmi.stdresid,col="red")
###Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected
cor.test(edyrs,bmi.resid)
##
## Pearson's product-moment correlation
##
## data: edyrs and bmi.resid
## t = 9.7423e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01742582 0.01742582
## sample estimates:
## cor
## 8.66226e-16
par(mfrow=c(2,2))
plot(fit)
bmi.sqrt = sqrt(bmi)
mean(bmi.sqrt)
## [1] 5.183408
sd(bmi.sqrt)
## [1] 0.4794398
boxplot(bmi.sqrt,ylab="BMI SQRT")
plot(density(bmi.sqrt),main = "Density Plot: BMI SQRT", xlab = "BMI")
qqnorm(bmi.sqrt,main = "Population Distribution of BMI SQRT")
qqline(bmi.sqrt)
plot(edyrs,bmi.sqrt,xlab="years of schooling",ylab="BMI SQRT")
fit3=lm(bmi.sqrt~edyrs)
summary(fit3)
##
## Call:
## lm(formula = bmi.sqrt ~ edyrs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6515 -0.3157 -0.0456 0.2587 5.0111
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.365757 0.016159 332.06 <2e-16 ***
## edyrs -0.015172 0.001297 -11.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4769 on 12649 degrees of freedom
## Multiple R-squared: 0.0107, Adjusted R-squared: 0.01062
## F-statistic: 136.8 on 1 and 12649 DF, p-value: < 2.2e-16
(16-12)*(-0.015172)
## [1] -0.060688
coefficients(fit3)
## (Intercept) edyrs
## 5.36575684 -0.01517213
confint(fit3,level = 0.99)
## 0.5 % 99.5 %
## (Intercept) 5.32412712 5.40738656
## edyrs -0.01851453 -0.01182973
bmi.sqrt.hat=fitted(fit3)
bmi.sqrt.resid3=resid(fit3)
summary(bmi.sqrt.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.108 5.153 5.184 5.183 5.199 5.366
predict(fit3,data.frame(edyrs=seq(0,17)))
## 1 2 3 4 5 6 7 8
## 5.365757 5.350585 5.335413 5.320240 5.305068 5.289896 5.274724 5.259552
## 9 10 11 12 13 14 15 16
## 5.244380 5.229208 5.214036 5.198863 5.183691 5.168519 5.153347 5.138175
## 17 18
## 5.123003 5.107831
predict(fit3,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
## fit lwr upr
## 1 5.365757 5.334082 5.397431
## 2 5.350585 5.321357 5.379813
## 3 5.335413 5.308613 5.362212
## 4 5.320240 5.295846 5.344635
## 5 5.305068 5.283047 5.327089
## 6 5.289896 5.270207 5.309585
## 7 5.274724 5.257307 5.292141
## 8 5.259552 5.244322 5.274782
## 9 5.244380 5.231207 5.257552
## 10 5.229208 5.217894 5.240521
## 11 5.214036 5.204267 5.223804
## 12 5.198863 5.190158 5.207569
## 13 5.183691 5.175380 5.192002
## 14 5.168519 5.159842 5.177197
## 15 5.153347 5.143628 5.163066
## 16 5.138175 5.126925 5.149425
## 17 5.123003 5.109904 5.136102
## 18 5.107831 5.092680 5.122982
predict(fit3,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
## fit lwr upr
## 1 5.365757 4.430448 6.301066
## 2 5.350585 4.415356 6.285814
## 3 5.335413 4.400256 6.270569
## 4 5.320240 4.385150 6.255331
## 5 5.305068 4.370037 6.240100
## 6 5.289896 4.354917 6.224876
## 7 5.274724 4.339790 6.209658
## 8 5.259552 4.324656 6.194448
## 9 5.244380 4.309515 6.179245
## 10 5.229208 4.294367 6.164048
## 11 5.214036 4.279212 6.148859
## 12 5.198863 4.264051 6.133676
## 13 5.183691 4.248882 6.118500
## 14 5.168519 4.233707 6.103332
## 15 5.153347 4.218524 6.088170
## 16 5.138175 4.203335 6.073015
## 17 5.123003 4.188139 6.057867
## 18 5.107831 4.172936 6.042726
ci.lwr=predict(fit3,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit3,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit3,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit3,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit3)
lines(sort(edyrs),ci.lwr,lty=3,col="orange")
lines(sort(edyrs),ci.upr,lty=3,col="orange")
lines(sort(edyrs),pi.lwr,lty=2,col="purple")
lines(sort(edyrs),pi.upr,lty=2,col="purple")
###Prediction accuracy
cor(bmi,bmi.sqrt.hat)
## [1] 0.1031405
mean(abs((bmi - bmi.sqrt.hat))/bmi)
## [1] 0.8023777
plot(edyrs,bmi.sqrt,xlab = "Years of Schooling",ylab = "BMI SQRT")
abline(fit3,col="purple")
###Assumption that mean of residuals is zero
plot(bmi.sqrt.hat,bmi.sqrt.resid3,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")
###Assumption Normality of residuals
qqnorm(bmi.sqrt.resid3)
qqline(bmi.sqrt.resid3,col="red")
bmi.sqrt.stdresid3 = rstandard(fit)
qqnorm(bmi.sqrt.stdresid3,ylab = "standardized residuals")
qqline(bmi.sqrt.stdresid3,col="red")
###Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected
cor.test(edyrs,bmi.sqrt.resid3)
##
## Pearson's product-moment correlation
##
## data: edyrs and bmi.sqrt.resid3
## t = 9.8536e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01742582 0.01742582
## sample estimates:
## cor
## 8.761228e-16
par(mfrow=c(2,2))
plot(fit3)