# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 1 Day 1 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ SCRIPT FROM CLASS SESSION ~
# ~ Annotated Solution Key    ~
# ~ with added comments and   ~
# ~ alternate possible coding ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Code has been updated to include 
# more plotting commands for regression

# BRFSS Data:
# The Behavioral Risk Factor Surveillance System (BRFSS) 
# is the nation's premier system of health-related telephone 
# surveys that collect state data about U.S. residents regarding 
# their health-related risk behaviors, chronic health conditions, 
# and use of preventive services. 

# This exercise examines a subset of variables related to 30-day alcohol
# consumption frequency and BMI in respondents from North Carolina
# who were low-quantity alcohol drinkers in the 2018 BRFSS cohort.

# 2018 Cohort Charcteristics 
# Overall cohort size = 437436 observations of 275 variables
# Restricting cohort to NC = 4729 observations
#      ***  subset(brfss2018,`_STATE`==37)  ***
# Restricting cohort to those who drank at least one drink of 
#      alcohol in last 30 days = 2084 observations
#      ***  subset(nc.brfss,DRNKANY5==1)  ***
# Restricting cohort to low-quantity (non-heavy) alcohol
#             drinkers = 1759 observations
#      ***  subset(l30.nc.brfss,`_RFDRHV6`==1)  ***

# Thus, for this exercise, we have the data frame brfssm1d1
#  with 1759 observations and have included the following variables:
# 1.  BMI             (units=kilograms per meter squared)
# 2.  DAYS.DRINKING   (# of Days on which had at least 1 alcoholic drink in past 30 days)
# 3.  AGE             (in years, collapsed above 80)
# 4.  SLEEP           (average # hours sleep in a 24 hour period)
# 5.  EXERCISE.YN     (Y/N participate in any physical activites/exercise during past month)
# 6.  FEMALE          (Sex at birth; 0=MALE, 1=FEMALE)

## Download and load the data file = brfssm1d1
download.file("http://www.duke.edu/~sgrambow/crp241data/brfssm1d1.RData",
              destfile = "brfssm1d1.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("brfssm1d1.RData")
# ---Question 1--- #
# Is there evidence of a relationship between percent BMI and DRINKING.DAYS? 
# Create a scatterplot to address this question.

# ANSWER:
plot(brfssm1d1$DAYS.DRINKING,brfssm1d1$BMI)

# NOTE 1 -- when creating scatterplots in the context of regression 
#          usually put X variable on the horizontal axis.
#          First variable listed in the plot statement using 
#          coding approach above will be plotted on the horizontal axis!
#
# NOTE 2 -- Alternate coding. You can also do the following 
#           which looks like and is more consistent with the 
#           syntax we use when specifying the linear model 
#           using the lm() function.  This coding will still
#           get the y variable on the y-axis and the x varialbe
#           on the x-axis (horizontal).
#
plot(brfssm1d1$BMI ~ brfssm1d1$DAYS.DRINKING)

#
# SO, based on the scatterplot, there does appear to be 
# a negative linear relationship between BMI 
# and DAYS.DRINKING but it doesn't appear in the plot to 
# be particularly strong.

# ---Question 2--- #
# Fit a linear regression model for BMI as a function of DAYS.DRINKING 
# Does the data suggest that these two variables are associated 
# at significance level 0.05? 

# ANSWER:  
# Let's create a regression object and call it 'bmi.fit'
# You could call it something else like 'fit'
# the summary() function gets you the standard regression
# coefficient table which provides the key p-value needed
# to address this question.

bmi.fit <- lm(BMI ~ DAYS.DRINKING, data=brfssm1d1)
summary(bmi.fit)
## 
## Call:
## lm(formula = BMI ~ DAYS.DRINKING, data = brfssm1d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0142  -3.9217  -0.8284   2.5337  28.1233 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   28.38158    0.18121 156.625  < 2e-16 ***
## DAYS.DRINKING -0.09742    0.01931  -5.044 5.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.757 on 1694 degrees of freedom
##   (63 observations deleted due to missingness)
## Multiple R-squared:  0.0148, Adjusted R-squared:  0.01422 
## F-statistic: 25.44 on 1 and 1694 DF,  p-value: 5.042e-07
# Examining the coefficient table created by the
# summary () function above, we see that the p-value
# for the DAYS.DRINKING coefficient is 5.04e-07 (or 0.000000504)
# which is definitely significant at the 0.05 level!

# ---Question 3--- #
# Consider the model fit in Question 2. 
# Does the data suggest that the mean difference in 
# BMI for a 10 day drinker vs. 20 day drinker could 
# be as much as 1.3? 

# ANSWER:
# To answer this question we need to rescale the slope
# coefficient from a 1 day change in drinking frequency to a 10 day change
# in drinking frequency.  In linear regression this is straightforward
# and can be found by simply multiplying the slope coefficient
# by 10 as shown below:
  
bmi.fit$coefficients*10
##   (Intercept) DAYS.DRINKING 
##   283.8158258    -0.9741893
#  (Intercept) DAYS.DRINKING 
#  283.8158258    -0.9741893 

# You could also perform this calculation manually be simply
# having r multiply the originall coefficient (-0.09742) by 10
-0.09742*10
## [1] -0.9742
# this yields 
# [1] -0.9742

# We are not done yet because this is just the point estimate
# for a 10 day difference in drinking days and doesn't address the precision 
# of that estimate which the question is asking about.
# The phrase 'as much as 2.5%' can be interpreted to refer
# to the confience interval around the estimate.  We can find the 
# CI for a 10 day difference simply by multipling the original
# confidence interval by 10 as shown below:
confint(bmi.fit)*10
##                    2.5 %      97.5 %
## (Intercept)   280.261694 287.3699574
## DAYS.DRINKING  -1.352985  -0.5953937
# This yields
#                    2.5 %    97.5 %
# (Intercept)   280.261694 287.3699574
# DAYS.DRINKING  -1.352985  -0.5953937
#
# We see that the lower 95% confidence limit for a 10 day 
# difference in age is -1.35 so we can conclude that YES 
# the mean difference in BMI for a 10 day drinker vs. 
# a 20 day drinker could be as much as 2.5%. 
# Note the directionality of the difference.


# ---Question 4--- #
# Consider the model fit in Question 2. 
# Using this model, estimate the mean BMI among all 25 day drinkers 

# ANSWER
# Here we can use the full model, not just the slope
# to estimate the predicted mean BMI
# among all 25 day drinkers. We will use both the 
# intercept and slope to do this:
# so MeanY = Beta0 + 25*Beta1
# now simply plug in the estimated parameter values
28.38158 + (25*-0.09742)
## [1] 25.94608
# This yields: 25.94608

# NOTE -- An Alternate approach would be to use
# the predict function as shown below:
predict(bmi.fit,data.frame(DAYS.DRINKING=25))
##        1 
## 25.94611
# this yields 25.94608
# We can also find the confidence interval
# for this mean estimate using a slight modification of 
# the predict() function:

predict(bmi.fit,data.frame(DAYS.DRINKING=25),interval = "confidence")
##        fit     lwr      upr
## 1 25.94611 25.1749 26.71732
# Which yields:
#       fit      lwr      upr
# 1 25.94611 25.1749 26.71732

# so the 95% CI for the mean percent 
# body fat for all 25 day drinkers is (25.17, 26.72)

# ---Question 5--- #
# Consider the model fit in Question 2. 
# Using this model, predict the BMI for a single 25 day drinker 
# Is this point estimate different from the point estimate in Question 4? 
# Is the precision of this point estimate different?

# ANSWER:
# The point estimate is the same as in Question 4

28.38158 + (25*-0.09742)
## [1] 25.94608
# This yields: 25.94608

# However, the precision of the estimate is different.
# Intuitively, the estimate of a mean (BMI)
# will be more precise than the estimate of the BMI
# for a single individual. # We can find the prediction
# interval for an individual using a slight modification of 
# the predict() function used above in Q5:

predict(bmi.fit,data.frame(DAYS.DRINKING=25),interval = "predict")
##        fit      lwr      upr
## 1 25.94611 14.62874 37.26348
# This yields:
#        fit      lwr      upr
# 1 25.94611 14.62874 37.26348

# so the 95% PI (prediction inteval) for the BMI 
# for a single 50 year old male is (14.63, 37.26) 
# which is quite a bit wider than the CI we found 
# above (25.17, 26.72)

# ---Question 6--- #
# Consider the model fit in Question 2. 
# Would it be reasonable to use this model 
# to estimate the mean BMI among all 45 day drinkers?
# The measure only looks at 30 day windows -- so from 
# this perspective it also doesn't make sense.
  
# ANSWER
# There are no 45 day drinkers in the cohort/dataset 
# so this would not be reasonable!
# Plotting for Linear Regression
# Now we want to use the regression object bmi.fit
# 1 create scatter plot
# 2 overlay fitted regression line
# 3 add confidence bands
# 4 add prediction bands
# code below based on: https://rpubs.com/Bio-Geek/71339
# our regression model is in the bmi.fit object

# create scatter plot
plot(brfssm1d1$DAYS.DRINKING, brfssm1d1$BMI,
     xlab="DAYS DRINKING", ylab="BMI", 
     main="Regression Plot", col="gray")
# add regression line
abline(bmi.fit, col="lightblue")
# calculated confidence bands
# need sequence of numbers to generate confidence bands
newx <- seq(0,30,by=1)
conf_interval <- predict(bmi.fit, newdata=data.frame(DAYS.DRINKING=newx),
                         interval="confidence",level = 0.95)
lines(newx, conf_interval[,2], col="blue", lty=2)
lines(newx, conf_interval[,3], col="blue", lty=2)

pred_interval <- predict(bmi.fit, newdata=data.frame(DAYS.DRINKING=newx),
                         interval="prediction",level = 0.95)
lines(newx, pred_interval[,2], col="darkorange", lty=2)
lines(newx, pred_interval[,3], col="darkorange", lty=2)

# Here is fancier code that produces a prettier plot
# again, source code can be found here: https://rpubs.com/Bio-Geek/71339

library(ggplot2)

# Plotting 95% Confidence Interval
ggplot(brfssm1d1, aes(x=DAYS.DRINKING, y=BMI))+
  geom_point()+
  geom_smooth(method=lm, se=TRUE)
## Warning: Removed 63 rows containing non-finite values (stat_smooth).
## Warning: Removed 63 rows containing missing values (geom_point).

# Plotting 95% Confidence Interval and Prediction Interval
temp_var <- predict(bmi.fit, interval="prediction")
## Warning in predict.lm(bmi.fit, interval = "prediction"): predictions on current data refer to _future_ responses
brf_data <- brfssm1d1[complete.cases(brfssm1d1$BMI),]
newdata <- cbind(brf_data,temp_var)
ggplot(newdata, aes(DAYS.DRINKING, BMI))+
  geom_point() +
  geom_line(aes(y=lwr), color = "red", linetype = "dashed")+
  geom_line(aes(y=upr), color = "red", linetype = "dashed")+
  geom_smooth(method=lm, se=TRUE)

# End of Program