library(readr)
hrs=read_csv("/Users/juliushunte/desktop/HRS_r1bmi.csv")
## Parsed with column specification:
## cols(
##   hhidpn = col_integer(),
##   raedyrs = col_integer(),
##   raeduc = col_character(),
##   rameduc = col_integer(),
##   rafeduc = col_integer(),
##   ragender = col_character(),
##   raracem = col_character(),
##   r1agey_b = col_integer(),
##   r1agey_e = col_integer(),
##   r1agey_m = col_integer(),
##   r1bmi = col_double()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 24 parsing failures.
## row # A tibble: 5 x 5 col     row col     expected           actual file                             expected   <int> <chr>   <chr>              <chr>  <chr>                            actual 1 12553 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… file 2 12561 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… row 3 12561 rafeduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… col 4 12562 rameduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS… expected 5 12562 rafeduc no trailing chara… .5     '/Users/juliushunte/desktop/HRS…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
head(hrs)
hrs.new = data.frame(hrs$r1bmi, hrs$raedyrs)
summary(hrs.new)
##    hrs.r1bmi      hrs.raedyrs   
##  Min.   : 12.8   Min.   : 0.00  
##  1st Qu.: 23.6   1st Qu.:11.00  
##  Median : 26.5   Median :12.00  
##  Mean   : 27.1   Mean   :12.02  
##  3rd Qu.: 29.6   3rd Qu.:14.00  
##  Max.   :102.7   Max.   :17.00  
##                  NA's   :1
hrs.new = na.omit(hrs.new)
summary(hrs.new)
##    hrs.r1bmi      hrs.raedyrs   
##  Min.   : 12.8   Min.   : 0.00  
##  1st Qu.: 23.6   1st Qu.:11.00  
##  Median : 26.5   Median :12.00  
##  Mean   : 27.1   Mean   :12.02  
##  3rd Qu.: 29.6   3rd Qu.:14.00  
##  Max.   :102.7   Max.   :17.00
colnames(hrs.new)=c("bmi","edyrs")
attach(hrs.new)
boxplot(bmi,ylab="BMI")

plot(edyrs,bmi)

plot(density(bmi),main = "Density Plot: BMI", xlab = "BMI")

qqnorm(bmi,main = "Population Distribution of BMI")
qqline(bmi)

boxplot(bmi,ylab="BMI")

plot(edyrs,bmi,xlab="years of schooling",ylab="BMI")

fit=lm(bmi~edyrs)
summary(fit)
## 
## Call:
## lm(formula = bmi ~ edyrs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.791  -3.401  -0.701   2.517  76.253 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.06242    0.17460  166.45   <2e-16 ***
## edyrs       -0.16348    0.01402  -11.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.153 on 12649 degrees of freedom
## Multiple R-squared:  0.01064,    Adjusted R-squared:  0.01056 
## F-statistic:   136 on 1 and 12649 DF,  p-value: < 2.2e-16

16 refers to college, 12 refers to high school. negative association.

(16-12)*(-.16348)
## [1] -0.65392
coefficients(fit)
## (Intercept)       edyrs 
##   29.062423   -0.163484
confint(fit,level = 0.99)
##                  0.5 %     99.5 %
## (Intercept) 28.6126192 29.5122262
## edyrs       -0.1995982 -0.1273697
bmi.hat=fitted(fit)
bmi.resid=resid(fit)
summary(bmi.hat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.28   26.77   27.10   27.10   27.26   29.06
predict(fit,data.frame(edyrs=seq(0,17)))
##        1        2        3        4        5        6        7        8 
## 29.06242 28.89894 28.73545 28.57197 28.40849 28.24500 28.08152 27.91803 
##        9       10       11       12       13       14       15       16 
## 27.75455 27.59107 27.42758 27.26410 27.10062 26.93713 26.77365 26.61016 
##       17       18 
## 26.44668 26.28320
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
##         fit      lwr      upr
## 1  29.06242 18.95654 39.16831
## 2  28.89894 18.79392 39.00396
## 3  28.73545 18.63122 38.83969
## 4  28.57197 18.46845 38.67550
## 5  28.40849 18.30560 38.51138
## 6  28.24500 18.14268 38.34733
## 7  28.08152 17.97968 38.18336
## 8  27.91803 17.81661 38.01946
## 9  27.75455 17.65346 37.85564
## 10 27.59107 17.49024 37.69189
## 11 27.42758 17.32695 37.52822
## 12 27.26410 17.16357 37.36462
## 13 27.10062 17.00013 37.20110
## 14 26.93713 16.83661 37.03765
## 15 26.77365 16.67301 36.87428
## 16 26.61016 16.50935 36.71098
## 17 26.44668 16.34560 36.54776
## 18 26.28320 16.18178 36.38461
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "c")
##         fit      lwr      upr
## 1  29.06242 28.72018 29.40466
## 2  28.89894 28.58313 29.21475
## 3  28.73545 28.44589 29.02502
## 4  28.57197 28.30839 28.83555
## 5  28.40849 28.17055 28.64642
## 6  28.24500 28.03226 28.45774
## 7  28.08152 27.89333 28.26971
## 8  27.91803 27.75347 28.08260
## 9  27.75455 27.61222 27.89688
## 10 27.59107 27.46882 27.71331
## 11 27.42758 27.32204 27.53313
## 12 27.26410 27.17004 27.35816
## 13 27.10062 27.01082 27.19041
## 14 26.93713 26.84337 27.03089
## 15 26.77365 26.66863 26.87866
## 16 26.61016 26.48861 26.73171
## 17 26.44668 26.30515 26.58821
## 18 26.28320 26.11949 26.44690
predict(fit,data.frame(edyrs=seq(0,17)),level=.95,interval = "p")
##         fit      lwr      upr
## 1  29.06242 18.95654 39.16831
## 2  28.89894 18.79392 39.00396
## 3  28.73545 18.63122 38.83969
## 4  28.57197 18.46845 38.67550
## 5  28.40849 18.30560 38.51138
## 6  28.24500 18.14268 38.34733
## 7  28.08152 17.97968 38.18336
## 8  27.91803 17.81661 38.01946
## 9  27.75455 17.65346 37.85564
## 10 27.59107 17.49024 37.69189
## 11 27.42758 17.32695 37.52822
## 12 27.26410 17.16357 37.36462
## 13 27.10062 17.00013 37.20110
## 14 26.93713 16.83661 37.03765
## 15 26.77365 16.67301 36.87428
## 16 26.61016 16.50935 36.71098
## 17 26.44668 16.34560 36.54776
## 18 26.28320 16.18178 36.38461
ci.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,2]
ci.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "c")[,3]
pi.lwr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,2]
pi.upr=predict(fit,data.frame(edyrs=sort(edyrs)),level=.95,interval = "p")[,3]
plot(edyrs,bmi,ylim = range(pi.lwr,pi.upr),type = "n")
abline(fit)
lines(sort(edyrs),ci.lwr,lty=3,col="red")
lines(sort(edyrs),ci.upr,lty=3,col="red")
lines(sort(edyrs),pi.lwr,lty=2,col="blue")
lines(sort(edyrs),pi.upr,lty=2,col="blue")

Prediction accuracy

cor(bmi,bmi.hat)
## [1] 0.1031405

Mean absolute percentage error

mean(abs((bmi - bmi.hat))/bmi)
## [1] 0.1432649

Assumption Linearity

plot(edyrs,bmi,xlab = "Years of Schooling",ylab = "BMI")
abline(fit,col="red")

Assumption that mean of residuals is zero

mean(bmi.resid)
## [1] -2.980676e-17

Assumption Homoscedasticity of residuals

plot(bmi.hat,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")

Assumption that mean of residuals is zero

mean(bmi.resid)
## [1] -2.980676e-17

Assumption Homoscedasticity of residuals

plot(bmi.hat,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="red")

Assumption Normality of residuals

qqnorm(bmi.resid)
qqline(bmi.resid,col="red")

bmi.stdresid = rstandard(fit)
qqnorm(bmi.stdresid,ylab = "standardized residuals")
qqline(bmi.stdresid,col="red")

Assumption No correlation, null hypothesis that true correlation is zero cannot be rejected

cor.test(edyrs,bmi.resid)
## 
##  Pearson's product-moment correlation
## 
## data:  edyrs and bmi.resid
## t = 9.7423e-14, df = 12649, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01742582  0.01742582
## sample estimates:
##         cor 
## 8.66226e-16
par(mfrow=c(2,2))
plot(fit)

bmi.log = log(bmi)
mean(bmi.log)
## [1] 3.28265
sd(bmi.log)
## [1] 0.1807348
hist(bmi.log, col = "green")

plot(bmi.log,bmi.resid,xlab="fitted values",ylab = "residuals")
abline(h=0,col="blue")