library(readr)
hrs=read_csv("/Users/juliushunte/desktop/HRS_r1bmi.csv")
## Parsed with column specification:
## cols(
## hhidpn = col_integer(),
## raedyrs = col_integer(),
## raeduc = col_character(),
## rameduc = col_integer(),
## rafeduc = col_integer(),
## ragender = col_character(),
## raracem = col_character(),
## r1agey_b = col_integer(),
## r1agey_e = col_integer(),
## r1agey_m = col_integer(),
## r1bmi = col_double()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 24 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 12553 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… file 2 12561 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… row 3 12561 rafeduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… col 4 12562 rameduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS… expected 5 12562 rafeduc no trailing chara… .5 '/Users/juliushunte/desktop/HRS…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
head(hrs)
hrs.new = data.frame(hrs$r1bmi, hrs$raedyrs)
summary(hrs.new)
## hrs.r1bmi hrs.raedyrs
## Min. : 12.8 Min. : 0.00
## 1st Qu.: 23.6 1st Qu.:11.00
## Median : 26.5 Median :12.00
## Mean : 27.1 Mean :12.02
## 3rd Qu.: 29.6 3rd Qu.:14.00
## Max. :102.7 Max. :17.00
## NA's :1
hrs.new = na.omit(hrs.new)
summary(hrs.new)
## hrs.r1bmi hrs.raedyrs
## Min. : 12.8 Min. : 0.00
## 1st Qu.: 23.6 1st Qu.:11.00
## Median : 26.5 Median :12.00
## Mean : 27.1 Mean :12.02
## 3rd Qu.: 29.6 3rd Qu.:14.00
## Max. :102.7 Max. :17.00
colnames(hrs.new)=c("bmi","edyrs")
attach(hrs.new)
boxplot(bmi,ylab="BMI")

plot(edyrs,bmi)

plot(density(bmi),main = "Density Plot: BMI", xlab = "BMI")

qqnorm(bmi,main = "Population Distribution of BMI")
qqline(bmi)

boxplot(bmi,ylab="BMI")

plot(edyrs,bmi,xlab="years of schooling",ylab="BMI")

fit=lm(bmi~edyrs)
summary(fit)
##
## Call:
## lm(formula = bmi ~ edyrs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.791 -3.401 -0.701 2.517 76.253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.06242 0.17460 166.45 <2e-16 ***
## edyrs -0.16348 0.01402 -11.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.153 on 12649 degrees of freedom
## Multiple R-squared: 0.01064, Adjusted R-squared: 0.01056
## F-statistic: 136 on 1 and 12649 DF, p-value: < 2.2e-16
16 refers to college, 12 refers to high school. negative association.
(16-12)*(-.16348)
## [1] -0.65392
coefficients(fit)
## (Intercept) edyrs
## 29.062423 -0.163484
confint(fit,level = 0.99)
## 0.5 % 99.5 %
## (Intercept) 28.6126192 29.5122262
## edyrs -0.1995982 -0.1273697
bmi.hat=fitted(fit)
bmi.resid=resid(fit)
summary(bmi.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.28 26.77 27.10 27.10 27.26 29.06
predict(fit,data.frame(edyrs=seq(0,17)))
## 1 2 3 4 5 6 7 8
## 29.06242 28.89894 28.73545 28.57197 28.40849 28.24500 28.08152 27.91803
## 9 10 11 12 13 14 15 16
## 27.75455 27.59107 27.42758 27.26410 27.10062 26.93713 26.77365 26.61016
## 17 18
## 26.44668 26.28320
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
## fit lwr upr
## 1 29.06242 18.95654 39.16831
## 2 28.89894 18.79392 39.00396
## 3 28.73545 18.63122 38.83969
## 4 28.57197 18.46845 38.67550
## 5 28.40849 18.30560 38.51138
## 6 28.24500 18.14268 38.34733
## 7 28.08152 17.97968 38.18336
## 8 27.91803 17.81661 38.01946
## 9 27.75455 17.65346 37.85564
## 10 27.59107 17.49024 37.69189
## 11 27.42758 17.32695 37.52822
## 12 27.26410 17.16357 37.36462
## 13 27.10062 17.00013 37.20110
## 14 26.93713 16.83661 37.03765
## 15 26.77365 16.67301 36.87428
## 16 26.61016 16.50935 36.71098
## 17 26.44668 16.34560 36.54776
## 18 26.28320 16.18178 36.38461
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
## fit lwr upr
## 1 29.06242 28.72018 29.40466
## 2 28.89894 28.58313 29.21475
## 3 28.73545 28.44589 29.02502
## 4 28.57197 28.30839 28.83555
## 5 28.40849 28.17055 28.64642
## 6 28.24500 28.03226 28.45774
## 7 28.08152 27.89333 28.26971
## 8 27.91803 27.75347 28.08260
## 9 27.75455 27.61222 27.89688
## 10 27.59107 27.46882 27.71331
## 11 27.42758 27.32204 27.53313
## 12 27.26410 27.17004 27.35816
## 13 27.10062 27.01082 27.19041
## 14 26.93713 26.84337 27.03089
## 15 26.77365 26.66863 26.87866
## 16 26.61016 26.48861 26.73171
## 17 26.44668 26.30515 26.58821
## 18 26.28320 26.11949 26.44690
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
## fit lwr upr
## 1 29.06242 18.95654 39.16831
## 2 28.89894 18.79392 39.00396
## 3 28.73545 18.63122 38.83969
## 4 28.57197 18.46845 38.67550
## 5 28.40849 18.30560 38.51138
## 6 28.24500 18.14268 38.34733
## 7 28.08152 17.97968 38.18336
## 8 27.91803 17.81661 38.01946
## 9 27.75455 17.65346 37.85564
## 10 27.59107 17.49024 37.69189
## 11 27.42758 17.32695 37.52822
## 12 27.26410 17.16357 37.36462
## 13 27.10062 17.00013 37.20110
## 14 26.93713 16.83661 37.03765
## 15 26.77365 16.67301 36.87428
## 16 26.61016 16.50935 36.71098
## 17 26.44668 16.34560 36.54776
## 18 26.28320 16.18178 36.38461
ci.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit)
lines(sort(edyrs),ci.lwr,lty=3,col="red")
lines(sort(edyrs),ci.upr,lty=3,col="red")
lines(sort(edyrs),pi.lwr,lty=2,col="blue")
lines(sort(edyrs),pi.upr,lty=2,col="blue")

Prediction accuracy
cor(bmi,bmi.hat)
## [1] 0.1031405
Mean absolute percentage error
mean(abs((bmi - bmi.hat))/bmi)
## [1] 0.1432649
Assumption Linearity
plot(edyrs,bmi,xlab = "Years of Schooling",ylab = "BMI")
abline(fit,col="red")

Assumption that mean of residuals is zero
mean(bmi.resid)
## [1] -2.980676e-17
Assumption Homoscedasticity of residuals
plot(bmi.hat,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")

Assumption that mean of residuals is zero
mean(bmi.resid)
## [1] -2.980676e-17
Assumption Homoscedasticity of residuals
plot(bmi.hat,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")

Assumption Normality of residuals
qqnorm(bmi.resid)
qqline(bmi.resid,col="red")

bmi.stdresid = rstandard(fit)
qqnorm(bmi.stdresid,ylab = "standardized residuals")
qqline(bmi.stdresid,col="red")

Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected
cor.test(edyrs,bmi.resid)
##
## Pearson's product-moment correlation
##
## data: edyrs and bmi.resid
## t = 9.7423e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01742582 0.01742582
## sample estimates:
## cor
## 8.66226e-16
par(mfrow=c(2,2))
plot(fit)

bmi.log = log(bmi)
mean(bmi.log)
## [1] 3.28265
sd(bmi.log)
## [1] 0.1807348
hist(bmi.log, col = "green")

plot(bmi.log,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="blue")
