# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 1 Day 1 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ SCRIPT FROM CLASS SESSION ~
# ~ Annotated Solution Key ~
# ~ with added comments and ~
# ~ alternate possible coding ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Code has been updated to include
# more plotting commands for regression
# BRFSS Data:
# The Behavioral Risk Factor Surveillance System (BRFSS)
# is the nation's premier system of health-related telephone
# surveys that collect state data about U.S. residents regarding
# their health-related risk behaviors, chronic health conditions,
# and use of preventive services.
# This exercise examines a subset of variables related to 30-day alcohol
# consumption frequency and BMI in respondents from North Carolina
# who were low-quantity alcohol drinkers in the 2018 BRFSS cohort.
# 2018 Cohort Charcteristics
# Overall cohort size = 437436 observations of 275 variables
# Restricting cohort to NC = 4729 observations
# *** subset(brfss2018,`_STATE`==37) ***
# Restricting cohort to those who drank at least one drink of
# alcohol in last 30 days = 2084 observations
# *** subset(nc.brfss,DRNKANY5==1) ***
# Restricting cohort to low-quantity (non-heavy) alcohol
# drinkers = 1759 observations
# *** subset(l30.nc.brfss,`_RFDRHV6`==1) ***
# Thus, for this exercise, we have the data frame brfssm1d1
# with 1759 observations and have included the following variables:
# 1. BMI (units=kilograms per meter squared)
# 2. DAYS.DRINKING (# of Days on which had at least 1 alcoholic drink in past 30 days)
# 3. AGE (in years, collapsed above 80)
# 4. SLEEP (average # hours sleep in a 24 hour period)
# 5. EXERCISE.YN (Y/N participate in any physical activites/exercise during past month)
# 6. FEMALE (Sex at birth; 0=MALE, 1=FEMALE)
## Download and load the data file = brfssm1d1
download.file("http://www.duke.edu/~sgrambow/crp241data/brfssm1d1.RData",
destfile = "brfssm1d1.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("brfssm1d1.RData")
# ---Question 1--- #
# Is there evidence of a relationship between percent BMI and DRINKING.DAYS?
# Create a scatterplot to address this question.
# ANSWER:
plot(brfssm1d1$DAYS.DRINKING,brfssm1d1$BMI)

# NOTE 1 -- when creating scatterplots in the context of regression
# usually put X variable on the horizontal axis.
# First variable listed in the plot statement using
# coding approach above will be plotted on the horizontal axis!
#
# NOTE 2 -- Alternate coding. You can also do the following
# which looks like and is more consistent with the
# syntax we use when specifying the linear model
# using the lm() function. This coding will still
# get the y variable on the y-axis and the x varialbe
# on the x-axis (horizontal).
#
plot(brfssm1d1$BMI ~ brfssm1d1$DAYS.DRINKING)

#
# SO, based on the scatterplot, there does appear to be
# a negative linear relationship between BMI
# and DAYS.DRINKING but it doesn't appear in the plot to
# be particularly strong.
# ---Question 2--- #
# Fit a linear regression model for BMI as a function of DAYS.DRINKING
# Does the data suggest that these two variables are associated
# at significance level 0.05?
# ANSWER:
# Let's create a regression object and call it 'bmi.fit'
# You could call it something else like 'fit'
# the summary() function gets you the standard regression
# coefficient table which provides the key p-value needed
# to address this question.
bmi.fit <- lm(BMI ~ DAYS.DRINKING, data=brfssm1d1)
summary(bmi.fit)
##
## Call:
## lm(formula = BMI ~ DAYS.DRINKING, data = brfssm1d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0142 -3.9217 -0.8284 2.5337 28.1233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.38158 0.18121 156.625 < 2e-16 ***
## DAYS.DRINKING -0.09742 0.01931 -5.044 5.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.757 on 1694 degrees of freedom
## (63 observations deleted due to missingness)
## Multiple R-squared: 0.0148, Adjusted R-squared: 0.01422
## F-statistic: 25.44 on 1 and 1694 DF, p-value: 5.042e-07
# Examining the coefficient table created by the
# summary () function above, we see that the p-value
# for the DAYS.DRINKING coefficient is 5.04e-07 (or 0.000000504)
# which is definitely significant at the 0.05 level!
# ---Question 3--- #
# Consider the model fit in Question 2.
# Does the data suggest that the mean difference in
# BMI for a 10 day drinker vs. 20 day drinker could
# be as much as 1.3?
# ANSWER:
# To answer this question we need to rescale the slope
# coefficient from a 1 day change in drinking frequency to a 10 day change
# in drinking frequency. In linear regression this is straightforward
# and can be found by simply multiplying the slope coefficient
# by 10 as shown below:
bmi.fit$coefficients*10
## (Intercept) DAYS.DRINKING
## 283.8158258 -0.9741893
# (Intercept) DAYS.DRINKING
# 283.8158258 -0.9741893
# You could also perform this calculation manually be simply
# having r multiply the originall coefficient (-0.09742) by 10
-0.09742*10
## [1] -0.9742
# this yields
# [1] -0.9742
# We are not done yet because this is just the point estimate
# for a 10 day difference in drinking days and doesn't address the precision
# of that estimate which the question is asking about.
# The phrase 'as much as 2.5%' can be interpreted to refer
# to the confience interval around the estimate. We can find the
# CI for a 10 day difference simply by multipling the original
# confidence interval by 10 as shown below:
confint(bmi.fit)*10
## 2.5 % 97.5 %
## (Intercept) 280.261694 287.3699574
## DAYS.DRINKING -1.352985 -0.5953937
# This yields
# 2.5 % 97.5 %
# (Intercept) 280.261694 287.3699574
# DAYS.DRINKING -1.352985 -0.5953937
#
# We see that the lower 95% confidence limit for a 10 day
# difference in age is -1.35 so we can conclude that YES
# the mean difference in BMI for a 10 day drinker vs.
# a 20 day drinker could be as much as 2.5%.
# Note the directionality of the difference.
# ---Question 4--- #
# Consider the model fit in Question 2.
# Using this model, estimate the mean BMI among all 25 day drinkers
# ANSWER
# Here we can use the full model, not just the slope
# to estimate the predicted mean BMI
# among all 25 day drinkers. We will use both the
# intercept and slope to do this:
# so MeanY = Beta0 + 25*Beta1
# now simply plug in the estimated parameter values
28.38158 + (25*-0.09742)
## [1] 25.94608
# This yields: 25.94608
# NOTE -- An Alternate approach would be to use
# the predict function as shown below:
predict(bmi.fit,data.frame(DAYS.DRINKING=25))
## 1
## 25.94611
# this yields 25.94608
# We can also find the confidence interval
# for this mean estimate using a slight modification of
# the predict() function:
predict(bmi.fit,data.frame(DAYS.DRINKING=25),interval = "confidence")
## fit lwr upr
## 1 25.94611 25.1749 26.71732
# Which yields:
# fit lwr upr
# 1 25.94611 25.1749 26.71732
# so the 95% CI for the mean percent
# body fat for all 25 day drinkers is (25.17, 26.72)
# ---Question 5--- #
# Consider the model fit in Question 2.
# Using this model, predict the BMI for a single 25 day drinker
# Is this point estimate different from the point estimate in Question 4?
# Is the precision of this point estimate different?
# ANSWER:
# The point estimate is the same as in Question 4
28.38158 + (25*-0.09742)
## [1] 25.94608
# This yields: 25.94608
# However, the precision of the estimate is different.
# Intuitively, the estimate of a mean (BMI)
# will be more precise than the estimate of the BMI
# for a single individual. # We can find the prediction
# interval for an individual using a slight modification of
# the predict() function used above in Q5:
predict(bmi.fit,data.frame(DAYS.DRINKING=25),interval = "predict")
## fit lwr upr
## 1 25.94611 14.62874 37.26348
# This yields:
# fit lwr upr
# 1 25.94611 14.62874 37.26348
# so the 95% PI (prediction inteval) for the BMI
# for a single 50 year old male is (14.63, 37.26)
# which is quite a bit wider than the CI we found
# above (25.17, 26.72)
# ---Question 6--- #
# Consider the model fit in Question 2.
# Would it be reasonable to use this model
# to estimate the mean BMI among all 45 day drinkers?
# The measure only looks at 30 day windows -- so from
# this perspective it also doesn't make sense.
# ANSWER
# There are no 45 day drinkers in the cohort/dataset
# so this would not be reasonable!
# Plotting for Linear Regression
# Now we want to use the regression object bmi.fit
# 1 create scatter plot
# 2 overlay fitted regression line
# 3 add confidence bands
# 4 add prediction bands
# code below based on: https://rpubs.com/Bio-Geek/71339
# our regression model is in the bmi.fit object
# create scatter plot
plot(brfssm1d1$DAYS.DRINKING, brfssm1d1$BMI,
xlab="DAYS DRINKING", ylab="BMI",
main="Regression Plot", col="gray")
# add regression line
abline(bmi.fit, col="lightblue")
# calculated confidence bands
# need sequence of numbers to generate confidence bands
newx <- seq(0,30,by=1)
conf_interval <- predict(bmi.fit, newdata=data.frame(DAYS.DRINKING=newx),
interval="confidence",level = 0.95)
lines(newx, conf_interval[,2], col="blue", lty=2)
lines(newx, conf_interval[,3], col="blue", lty=2)
pred_interval <- predict(bmi.fit, newdata=data.frame(DAYS.DRINKING=newx),
interval="prediction",level = 0.95)
lines(newx, pred_interval[,2], col="darkorange", lty=2)
lines(newx, pred_interval[,3], col="darkorange", lty=2)
# Here is fancier code that produces a prettier plot
# again, source code can be found here: https://rpubs.com/Bio-Geek/71339
library(ggplot2)

# Plotting 95% Confidence Interval
ggplot(brfssm1d1, aes(x=DAYS.DRINKING, y=BMI))+
geom_point()+
geom_smooth(method=lm, se=TRUE)
## Warning: Removed 63 rows containing non-finite values (stat_smooth).
## Warning: Removed 63 rows containing missing values (geom_point).

# Plotting 95% Confidence Interval and Prediction Interval
temp_var <- predict(bmi.fit, interval="prediction")
## Warning in predict.lm(bmi.fit, interval = "prediction"): predictions on current data refer to _future_ responses
brf_data <- brfssm1d1[complete.cases(brfssm1d1$BMI),]
newdata <- cbind(brf_data,temp_var)
ggplot(newdata, aes(DAYS.DRINKING, BMI))+
geom_point() +
geom_line(aes(y=lwr), color = "red", linetype = "dashed")+
geom_line(aes(y=upr), color = "red", linetype = "dashed")+
geom_smooth(method=lm, se=TRUE)

# End of Program