# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 7   ~
# ~ Simple Linear Regression ~
# ~      Diagnostics         ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Body Fat Data:
# Percentage of body fat, age, weight, height, and ten body circumference 
# measurements (e.g., abdomen, neck, ankle, etc.) are recorded for 252 men. 
# Body fat is estimated through an underwater weighing technique. 

# There are numerous variables in the bodyfat dataset, but those of interest for 
# the current exercise include:
# 1.  brobf  : % Body Fat
# 2.  abdmn  : Abdomen Circumference (cm)
# 3.  neck   : Neck Circumference (cm)
# 4.  ankle  : Ankle Circumference (cm)
# 5.  height : Height (in)

# Download and load the data file: bodyfat.RData
load(url("http://www.duke.edu/~sgrambow/crp241data/bodyfat.RData"))
# Describing the data

# - Summary statistics 
summary(bodyfat$brobf)      # Response/outcome/dependent variable
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   12.80   19.00   18.94   24.60   45.10
summary(bodyfat$abdmn)    # Covariate/explanatory/predictor/independent variable
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   69.40   84.58   90.95   92.56   99.33  148.10
# - Scatter plot to examine co-variation 
plot(x=bodyfat$abdmn,y=bodyfat$brobf,
     main='Scatter Plot of % Body Fat vs. Abdomen Circ', 
     ylab='% Body Fat',
     xlab='Abdomen Circumference (cm)',
     cex=1.5,pch=19,col='blue')

# Fitting the simple linear regression model 
# Model: % Body Fat ~ Abdomen Circ 

fit <- lm(brobf ~ abdmn,data=bodyfat)
# Nice Summary Output
summary(fit)
## 
## Call:
## lm(formula = brobf ~ abdmn, data = bodyfat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.6257  -3.4672   0.0111   3.1415  11.9754 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -35.19661    2.46229  -14.29   <2e-16 ***
## abdmn         0.58489    0.02643   22.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.514 on 250 degrees of freedom
## Multiple R-squared:  0.6621, Adjusted R-squared:  0.6608 
## F-statistic: 489.9 on 1 and 250 DF,  p-value: < 2.2e-16
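# In simple linear regression, the Multiple R-squared reported above equals the 
# squared Pearson correlation between X and Y; a quick sanity check (not part of 
# the original exercise):
cor(bodyfat$abdmn, bodyfat$brobf)^2   # about 0.662, matching the Multiple R-squared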
# Include confidence limits for regression parameters
confint(fit)
##                  2.5 %      97.5 %
## (Intercept) -40.046092 -30.3471234
## abdmn         0.532846   0.6369351
# ANOVA table for Simple Linear Regression
# (more about this in a supplementary video);
# this can be used to test the fit of the entire model 
# (an equivalent anova() call is shown after the output below)
summary(aov(fit))
##              Df Sum Sq Mean Sq F value Pr(>F)    
## abdmn         1   9984    9984   489.9 <2e-16 ***
## Residuals   250   5095      20                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
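# The same ANOVA table can also be obtained directly from the lm fit with anova();
# shown here only as an equivalent alternative to summary(aov(fit)):
anova(fit)   # F statistic, degrees of freedom, and p-value match the table above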
# Plot the regression line
# 1. % Body Fat vs. Abdomen Circumference
# Add regression line to scatter plot
plot(x=bodyfat$abdmn,y=bodyfat$brobf,
     main='Scatter Plot of % Body Fat vs. Abdomen Circ', 
     ylab='% Body Fat',
     xlab='Abdomen Circumference (cm)',
     cex=1.5,pch=19,col='blue')
abline(fit,lty=1,col='red',lwd=2)

# circle observation 39 (abdmn = 148.1, brobf = 33.8) to highlight it as 
# a possibly problematic (high-leverage) point
points(148.1,33.8,cex=3,pch=1,col='red')

# Using the simple linear regression model for ... 

# (1) Estimate the mean % body fat among men whose 
#     abdomen circumference is 100 cm

# Option 1: Do "by hand" 
-35.2 + (100*0.58489) 
## [1] 23.289
# Option 2: Use predict() function 
predict(fit,newdata=data.frame(abdmn=100),interval='conf')
##        fit      lwr      upr
## 1 23.29244 22.61142 23.97347
# (2) Estimate the % body fat for a new observation 
#     whose abdomen circumference is 100 cm

# Option 1: Do "by hand" 
-35.2 + (100*0.58489) 
## [1] 23.289
# Option 2: Use predict() function 
predict(fit,newdata=data.frame(abdmn=100),interval='pred')
##        fit      lwr      upr
## 1 23.29244 14.37532 32.20957
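# A sketch of the "by hand" 95% prediction interval at abdmn = 100, using the 
# textbook formula yhat +/- t * s * sqrt(1 + 1/n + (x0 - xbar)^2 / Sxx); it 
# should match the predict(..., interval='pred') limits above up to rounding.
x0   <- 100
n    <- nrow(bodyfat)
xbar <- mean(bodyfat$abdmn)
Sxx  <- sum((bodyfat$abdmn - xbar)^2)
s    <- summary(fit)$sigma                  # residual standard error
yhat <- coef(fit)[1] + coef(fit)[2]*x0      # point prediction
yhat + c(-1, 1) * qt(0.975, df = n - 2) * s * sqrt(1 + 1/n + (x0 - xbar)^2/Sxx)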
# - Visualization of estimation vs. prediction 
# - Create scatter plot of X vs. Y
plot(x=bodyfat$abdmn,y=bodyfat$brobf,col='blue',pch=19,cex=1.5)
# - Add the fitted regression line (red line)
abline(fit,lwd=2,col='red')

# - Add the estimated mean % body fat at abdmn = 100 (red star)
beta0 <- fit$coefficients[1] # intercept
beta1 <- fit$coefficients[2] # slope
points(100,(beta0 + (100*beta1) ),pch='*',col='red',cex=3)
# - Add 95% confidence bands for the estimated means (red dashed lines)
lines(seq(70,140,by=1),
      predict(fit,newdata=data.frame(abdmn=seq(70,140,by=1)),interval='conf')[,2],
      col='red',lty=2,lwd=2)
lines(seq(70,140,by=1),
      predict(fit,newdata=data.frame(abdmn=seq(70,140,by=1)),interval='conf')[,3],
      col='red',lty=2,lwd=2)

# - Add the prediction for a new observation with abdmn = 100 (magenta star; 
#   the point prediction coincides with the mean estimate)
points(100,(beta0 + (100*beta1) ),pch='*',col='magenta',cex=3)
# - Add 95% prediction bands for new observations (magenta dashed lines)
lines(seq(70,140,by=1),
      predict(fit,newdata=data.frame(abdmn=seq(70,140,by=1)),interval='pred')[,2],
      col='magenta',lty=2,lwd=2)
lines(seq(70,140,by=1),
      predict(fit,newdata=data.frame(abdmn=seq(70,140,by=1)),interval='pred')[,3],
      col='magenta',lty=2,lwd=2)
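# A legend is not part of the original plot; a minimal sketch using legend() 
# to label the fitted line and the two sets of bands:
legend('topleft',
       legend=c('Fitted line','95% confidence band (mean)','95% prediction band (new obs.)'),
       col=c('red','red','magenta'),lty=c(1,2,2),lwd=2,bty='n')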

# Checking the assumptions of the simple linear regression model 

# Now examine regression diagnostics for this model,
# using the linear regression model fit above (fit)
#
# A single command produces the suggested diagnostic plots
plot(fit)

# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points
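# Individual diagnostic plots can also be requested one at a time via the 
# 'which' argument of plot.lm(), shown here as an optional alternative:
plot(fit, which = 1)   # Residuals vs Fitted only
plot(fit, which = 2)   # Normal Q-Q only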

# To get all plots on one figure
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit)

par(mfrow=c(1,1)) # Change back to 1 x 1

bodyfat_no39 <- bodyfat[-39,]
# - Use the function lm() to fit the linear regression model
fit_no39 <- lm(brobf~abdmn,data=bodyfat_no39)
summary(fit_no39)
## 
## Call:
## lm(formula = brobf ~ abdmn, data = bodyfat_no39)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0655  -3.4059   0.1948   2.8981  11.8462 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -38.60529    2.51098  -15.38   <2e-16 ***
## abdmn         0.62257    0.02703   23.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.366 on 249 degrees of freedom
## Multiple R-squared:  0.6806, Adjusted R-squared:  0.6793 
## F-statistic: 530.5 on 1 and 249 DF,  p-value: < 2.2e-16
confint(fit_no39)
##                   2.5 %      97.5 %
## (Intercept) -43.5507568 -33.6598163
## abdmn         0.5693308   0.6758044
# Add the re-fitted regression line (without point 39) to the scatter plot
plot(x=bodyfat_no39$abdmn,y=bodyfat_no39$brobf,
     main='Scatter Plot of % Body Fat vs. Abdomen Circ', 
     ylab='% Body Fat',
     xlab='Abdomen Circumference (cm)',
     col='blue',pch=19,cex=1.5)
abline(fit_no39,lty=1,col='black',lwd=2)

# what is the predicted % body fat for men with 100 cm abdmn 
# for the model without point 39?
predict(fit_no39,newdata=data.frame(abdmn=100),interval='conf')
##        fit      lwr      upr
## 1 23.65147 22.97244 24.33051
# what is the predicted % body fat for men with 100 cm abdmn 
# for the model with point 39?
predict(fit,newdata=data.frame(abdmn=100),interval='conf')
##        fit      lwr      upr
## 1 23.29244 22.61142 23.97347
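# Side-by-side comparison of the estimated coefficients with and without 
# point 39 (a quick way to gauge its influence; not in the original exercise):
cbind(with_39 = coef(fit), without_39 = coef(fit_no39))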
# use the qqPlot() function from the car package
# for a slightly nicer QQ plot
# QQ plots are ubiquitous in statistics. The most common, simple use: fit a 
# linear regression model and check whether the points lie approximately on 
# the line. If they don't, the residuals are not approximately Gaussian, 
# which suggests the errors aren't either, so standard confidence intervals 
# and p-values may be invalid.
# 
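# For reference, a base-R QQ plot of the residuals can be drawn with 
# qqnorm()/qqline(); the car version below additionally labels the most 
# extreme points and (by default) adds a confidence envelope:
qqnorm(resid(fit), main = 'Normal Q-Q Plot of Residuals')
qqline(resid(fit), col = 'red', lwd = 2)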

library(car)
## Loading required package: carData
qqPlot(fit, main="Q-Q Plot") # from car package

## [1]  39 207
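# the printed indices (39 and 207) are the observations that qqPlot() 
# identifies as having the most extreme residuals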
# How is the simple linear regression model really fit?
# The code below just illustrates what regression is doing (choosing the line 
# that minimizes the sum of squared residuals); it is not part of the actual analysis 
fit <- lm(brobf ~ abdmn,data=bodyfat)
# - Create scatter plot of X vs. Y
plot(x=bodyfat$abdmn,y=bodyfat$brobf,cex=1.5,pch=19,col='blue')
# - Add the fitted regression line (black line)
abline(fit,lwd=2,col='red')
# - Add lines to represent residuals for two data points 
#   (red lines and red stars)
beta0 <- fit$coefficients[1]
beta1 <- fit$coefficients[2]
# highlight point (104.5, 16.9)
points(104.5,(16.9),pch=1,col='red',cex=3)
points(104.5,(beta0+(beta1*104.5)),pch='*',col='red',cex=3)
segments(104.5,16.9,104.5,(beta0+(beta1*104.5)),col='red',lty=2)

# highlight point (122.1,45.1)
points(122.1,(45.1),pch=1,col='red',cex=3)
points(122.1,(beta0+(beta1*122.1)),pch='*',col='red',cex=3)
segments(122.1,45.1,122.1,(beta0+(beta1*122.1)),col='red',lty=2)
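# Under the hood, lm() finds the least-squares line, i.e. the slope and 
# intercept that minimize the sum of squared residuals. For simple linear 
# regression these have closed-form solutions; a quick check against coef(fit):
b1 <- cov(bodyfat$abdmn, bodyfat$brobf) / var(bodyfat$abdmn)   # slope
b0 <- mean(bodyfat$brobf) - b1 * mean(bodyfat$abdmn)           # intercept
c(intercept = b0, slope = b1)   # should match coef(fit): about -35.197 and 0.585
coef(fit)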

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~      Sample showing regression diagnostics             ~
# ~      for SLR with curvilinear data pattern            ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Author -- Megan Neely, PhD (with some modifications from Steve)

# Download and load the linearity dataset: 
download.file("http://www.duke.edu/~sgrambow/crp241data/linearity.RData",
              destfile = "linearity.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("linearity.RData")

# Plot the data to see the relationship between x and y: 
# - The dots represent the observed data, which contain noise (i.e., error) 
#   around the underlying relationship we are trying to estimate (i.e., via regression)
plot( linearity$x,linearity$y,pch=19,main='Scatterplot for X & Y',
      xlab='X',
      ylab='Y')

# What happens if we "fit a line" to the data to estimate the relationship 
# between x and y?
# - That is, we fit a linear regression line and assume the relationship 
#   between x and y is linear?
# (1) Fit the regression line
#     - NOTE: p-value for x is greater than 0.05; says there is no relationship 
#             between x and y, but we know that is not true from looking at the 
#             data. Happens because this analysis ignored the non-linear 
#             relationship between x and y. 
#     - the estimated slope is -0.07 and associated p-value is 0.15
fit <- lm(y~x,data=linearity) 
summary(fit)
## 
## Call:
## lm(formula = y ~ x, data = linearity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4729 -1.8603  0.1531  1.9353  4.5427 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.19872    0.89892   14.68 3.09e-15 ***
## x           -0.07083    0.04754   -1.49    0.147    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.483 on 30 degrees of freedom
## Multiple R-squared:  0.06889,    Adjusted R-squared:  0.03785 
## F-statistic:  2.22 on 1 and 30 DF,  p-value: 0.1467
# (2) Plot the fitted regression line (blue) and again the data (dots) 
#     
plot( linearity$x,linearity$y,pch=19,main='Scatterplot for X & Y with 
      Simple Linear Regression Line', xlab='X', ylab='Y')
abline(fit,col='blue',lty=2, lwd=2)

# check the diagnostics
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit)

par(mfrow=c(1,1)) # Change back to 1 x 1
# There is a clear curvilinear pattern in the Residuals vs Fitted plot, 
# suggesting a problem with the linearity assumption.
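# A minimal sketch of one common remedy (not part of the original exercise): 
# model the curvature directly by adding a quadratic term, then re-check the 
# diagnostic plots.
fit_quad <- lm(y ~ x + I(x^2), data = linearity)   # illustrative follow-up model
summary(fit_quad)
par(mfrow = c(2,2))
plot(fit_quad)
par(mfrow = c(1,1))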
# End of Program