Tasks 1 & 2

My independent variable will be annual family income. I think that annual family income will have a negative relationship with BMI, the dependent variable, as individuals with higher family incomes will have more access to nutritious foods. I do not think the relationship is causal or that the two variables have a perfectly linear relationship, since other factors may also impact BMI. After importing the data, I filtered out NA values for BMI and values above 76 for annual family income, since those indicated missing values as well.

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
NHANES <- read.csv("/Users/Nazija/Desktop/NHANES.csv")
#head(NHANES)
#data = data.frame(NHANES$bmxbmi, NHANES$indhhin2)
data <-NHANES%>%
  select(bmxbmi, indfmin2)%>%
  filter(indfmin2 <= 15)%>%
  filter(!is.na(bmxbmi))
#data
attach(data)

Task 3

From the histogram and qqplot of BMI, it seems to be skewed to the right with most people having a BMI between 20 to 40, but a few outliers with much higher BMIS.

hist(bmxbmi)

qqnorm(bmxbmi)

A qqplot is not the most appropriate since annual household income is an interval variable, but from the histogram, it is clear that annual household income is not normally distributed.

hist(indfmin2)

qqnorm(indfmin2)

Task 4

fit = lm(bmxbmi~indfmin2)
summary(fit)
## 
## Call:
## lm(formula = bmxbmi ~ indfmin2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.387  -5.962  -0.789   4.461  56.213 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 26.06171    0.17195 151.566  < 2e-16 ***
## indfmin2    -0.08720    0.01901  -4.588 4.55e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.756 on 8212 degrees of freedom
## Multiple R-squared:  0.002556,   Adjusted R-squared:  0.002435 
## F-statistic: 21.05 on 1 and 8212 DF,  p-value: 4.552e-06

Relationship between the two variables:

BMI hat = 26.06 + (-.09)HouseholdIncome. For every 1-point increase in the household income scale, BMI will decrease by 9%.

The R-squared statistic is .003. It is not near 1, so this is not a good model

#anova(fit)
#confint(fit, level = 0.95)

Task 5

Predict the average BMI for family income levels 1-15

predicted_values = predict(fit, data.frame(indfmin2 = seq(1,15)))
predicted_values
##        1        2        3        4        5        6        7        8 
## 25.97451 25.88731 25.80011 25.71291 25.62571 25.53851 25.45132 25.36412 
##        9       10       11       12       13       14       15 
## 25.27692 25.18972 25.10252 25.01532 24.92812 24.84092 24.75372

95% confidence bands for the predicted mean BMIs

predict(fit, data.frame(indfmin2 = seq(1,15)), level = .95, interval = "c")
##         fit      lwr      upr
## 1  25.97451 25.66920 26.27982
## 2  25.88731 25.61237 26.16225
## 3  25.80011 25.55364 26.04658
## 4  25.71291 25.49228 25.93355
## 5  25.62571 25.42725 25.82418
## 6  25.53851 25.35721 25.71982
## 7  25.45132 25.28063 25.62200
## 8  25.36412 25.19627 25.53196
## 9  25.27692 25.10375 25.45009
## 10 25.18972 25.00376 25.37567
## 11 25.10252 24.89771 25.30733
## 12 25.01532 24.78708 25.24356
## 13 24.92812 24.67314 25.18310
## 14 24.84092 24.55682 25.12502
## 15 24.75372 24.43878 25.06866

95% prediction bands for the predicted individual BMIs

predict(fit, data.frame(indfmin2 = seq(1,15)), level = .95, interval = "p")
##         fit       lwr      upr
## 1  25.97451 10.768135 41.18088
## 2  25.88731 10.681515 41.09311
## 3  25.80011 10.594804 41.00542
## 4  25.71291 10.508002 40.91782
## 5  25.62571 10.421109 40.83032
## 6  25.53851 10.334124 40.74290
## 7  25.45132 10.247048 40.65558
## 8  25.36412 10.159881 40.56835
## 9  25.27692 10.072623 40.48121
## 10 25.18972  9.985273 40.39417
## 11 25.10252  9.897832 40.30721
## 12 25.01532  9.810299 40.22034
## 13 24.92812  9.722675 40.13357
## 14 24.84092  9.634960 40.04689
## 15 24.75372  9.547154 39.96030

Plot the predicted average BMIs

plot(predicted_values)

Plot the 95% confidence and prediction bands

CI_lwr = predict(fit,data.frame(indfmin2 = sort(indfmin2)), level = .95, interval = "c")[,2]
CI_upr = predict(fit,data.frame(indfmin2 = sort(indfmin2)), level = .95, interval = "c")[,3]
PI_lwr = predict(fit,data.frame(indfmin2 = sort(indfmin2)), level = .95, interval = "p")[,2]
PI_upr = predict(fit,data.frame(indfmin2 = sort(indfmin2)), level = .95, interval = "p")[,3]
plot(indfmin2, bmxbmi, ylim = range(PI_lwr, PI_upr), type = "n")
abline(fit)
lines(sort(indfmin2), CI_lwr, lty = 3, col = "red")
lines(sort(indfmin2), CI_upr, lty = 3, col = "red")
lines(sort(indfmin2), PI_lwr, lty = 2, col = "blue")
lines(sort(indfmin2), PI_upr, lty = 2, col = "blue")