# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 6-7 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# Body Fat Data:
# Percentage of body fat, age, weight, height, and ten body circumference 
# measurements (e.g., abdomen, neck, ankle, etc.) are recorded for 252 men. 
# Body fat is estimated through an underwater weighing technique. 

# There are numerous variables in the lead dataset but those of interest for 
# the current exercise include:
# 1.  brobf  : % Body Fat
# 2.  abdmn  : Abdomen Circumference (cm)
# 3.  neck   : Neck Circumference (cm)
# 4.  ankle  : Ankle Circumference (cm)
# 5.  height : Height (in)

## Download and load the data file = bpdata
download.file("http://www.duke.edu/~sgrambow/crp241data/bodyfat.RData",
              destfile = "bodyfat.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("bodyfat.RData")

## Scatter Plots: 
# 1. % Body Fat vs. Abdomen Circumference
# This is one of the scatterplots on Slide 9
# of the Module 4 Day 6-7 slides
plot(x=bodyfat$abdmn,y=bodyfat$brobf,
     main='Scatter Plot of % Body Fat vs. Abdomen Circ', 
     ylab='% Body Fat',
     xlab='Abdomen Circumference (cm)',
     cex=1,pch=0,col='sienna3')

# 2. Neck Circumference vs. Ankle Circumference
# This is one of the scatterplots on Slide 9
# of the Module 4 Day 6-7 slides
plot(x=bodyfat$neck,y=bodyfat$ankle,
     main='Scatter Plot of Neck vs. Ankle Circ', 
     ylab='Neck Circumference (cm)',
     xlab='Ankle Circumference (cm)',
     cex=1,pch=1,col='sienna3')

# 3. % Body Fat vs. Height
# This is one of the scatterplots on Slide 10
# of the Module 4 Day 6-7 slides
plot(x=bodyfat$height,y=bodyfat$brobf,
     main='Scatter Plot of % Body Fat vs. Height', 
     ylab='% Body Fat',
     xlab='Height (in)',
     cex=1,pch=2,col='sienna3')

# removing the single outlier
# changes the impression of the scatterplot dramatically
bsub <- subset(bodyfat,bodyfat$height > 35)
plot(x=bsub$height,y=bsub$brobf,
     main='Scatter Plot of % Body Fat vs. Height', 
     ylab='% Body Fat',
     xlab='Height (in)',
     cex=1,pch=2,col='sienna3')

cor(bsub$brobf,bsub$height)
## [1] -0.02261395
# Basic Scatterplot Matrix
# demonstrated with bpdata
# See slide 10 of the Module 4 Day 6-7 slides
pairs(~bodyfat$brobf + bodyfat$abdm + bodyfat$ankle + bodyfat$height,data=bodyfat,
      col='blue',main="Simple Scatterplot Matrix")

# Scatterplot Matrices from the car Package
# there are numerous options to customize this plot
# see function help for details
# See slide 14 of the Module 4 Day 6-7 slides
# You made need to first uncomment the line below and install the car package
# install.packages('car')
library(car)
## Loading required package: carData
scatterplotMatrix(~bodyfat$brobf + bodyfat$abdmn + bodyfat$ankle + bodyfat$height, 
                  data=bodyfat,main="Scatterplot Matrix bodyfat",smooth=FALSE,regLine=FALSE,
                  diagonal=list(method="histogram"))

## Correlation Coefficients and Tests of Ho: Rho = 0: 
# See slide 18 of the Module 4 Day 6-7 slides
# 1. % Body Fat vs. Abdomen Circumference
cor.test(x=bodyfat$abdmn,y=bodyfat$brobf,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  bodyfat$abdmn and bodyfat$brobf
## t = 22.134, df = 250, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7672855 0.8516445
## sample estimates:
##       cor 
## 0.8137062
# 2. Neck Circumference vs. Ankle Circumference
cor.test(x=bodyfat$neck,y=bodyfat$ankle,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  bodyfat$neck and bodyfat$ankle
## t = 8.602, df = 250, p-value = 8.768e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3765570 0.5679266
## sample estimates:
##       cor 
## 0.4778924
# 3. % Body Fat vs. Height
cor.test(x=bodyfat$height,y=bodyfat$brobf,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  bodyfat$height and bodyfat$brobf
## t = -1.4145, df = 250, p-value = 0.1585
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.21036293  0.03485018
## sample estimates:
##         cor 
## -0.08910641
## Simple Linear Regression Model: 
# Model: % Body Fat ~ Abdomen Circ 
# - Use the function lm() to fit the linear regression model
fit1 <- lm(brobf~abdmn,data=bodyfat)
summary(fit1)
## 
## Call:
## lm(formula = brobf ~ abdmn, data = bodyfat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.6257  -3.4672   0.0111   3.1415  11.9754 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -35.19661    2.46229  -14.29   <2e-16 ***
## abdmn         0.58489    0.02643   22.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.514 on 250 degrees of freedom
## Multiple R-squared:  0.6621, Adjusted R-squared:  0.6608 
## F-statistic: 489.9 on 1 and 250 DF,  p-value: < 2.2e-16
confint(fit1)
##                  2.5 %      97.5 %
## (Intercept) -40.046092 -30.3471234
## abdmn         0.532846   0.6369351
# Add regression line to scatter plot
# See slide 51 of the Module 4 Day 6-7 slides
plot(x=bodyfat$abdmn,y=bodyfat$brobf,
     main='Scatter Plot of % Body Fat vs. Abdomen Circ', 
     ylab='% Body Fat',
     xlab='Abdomen Circumference (cm)',
     cex=1,pch=0,col='sienna3')
abline(fit1,lty=2,col='black',lwd=2)

# Checking the assumptions of the simple linear regression model 

# Now examine regression diagnostics for this model
# Using  Linear Regression Model above 
#
# single command to produce suggested diagnostic plots
# See slides 86-87 of the Module 4 Day 6-7 slides
plot(fit1)

# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points

# To get all plots on one figure
# See slide 87 of the Module 4 Day 6-7 slides
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit1)

par(mfrow=c(1,1)) # Change back to 1 x 1

bodyfat_no39 <- bodyfat[-39,]
# - Use the function lm() to fit the linear regression model
fit_no39 <- lm(brobf~abdmn,data=bodyfat_no39)
summary(fit_no39)
## 
## Call:
## lm(formula = brobf ~ abdmn, data = bodyfat_no39)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0655  -3.4059   0.1948   2.8981  11.8462 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -38.60529    2.51098  -15.38   <2e-16 ***
## abdmn         0.62257    0.02703   23.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.366 on 249 degrees of freedom
## Multiple R-squared:  0.6806, Adjusted R-squared:  0.6793 
## F-statistic: 530.5 on 1 and 249 DF,  p-value: < 2.2e-16
confint(fit_no39)
##                   2.5 %      97.5 %
## (Intercept) -43.5507568 -33.6598163
## abdmn         0.5693308   0.6758044
# Add regression line to scatter plot
plot(x=bodyfat_no39$abdmn,y=bodyfat_no39$brobf,
     main='Scatter Plot of % Body Fat vs. Abdomen Circ', 
     ylab='% Body Fat',
     xlab='Abdomen Circumference (cm)',
     col='blue',pch=19,cex=1.5)
abline(fit_no39,lty=1,col='black',lwd=2)

# what is predicted % body fat for men with 100 cm adbmn 
# for the model without point 39
predict(fit_no39,newdata=data.frame(abdmn=100),interval='conf')
##        fit      lwr      upr
## 1 23.65147 22.97244 24.33051
# what is predicted % body fat for men with 100 cm adbmn 
# for the model with point 39
predict(fit1,newdata=data.frame(abdmn=100),interval='conf')
##        fit      lwr      upr
## 1 23.29244 22.61142 23.97347
# use qqplot() function from Car Package
# for a slightly nicer qq plot
# See slide 92 of the Module 4 Day 6-7 slides
library(car)
qqPlot(fit1, main="Q-Q Plot") # from car package

## [1]  39 207
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 7 Linearity Example ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~      Sample showing regression diagnostics             ~
# ~      for SLR with curvilinear data pattern            ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Author -- Megan Neely, PhD (with some modifications from Steve)

# Download and load the linearity dataset: 
download.file("http://www.duke.edu/~sgrambow/crp241data/linearity.RData",
              destfile = "linearity.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("linearity.RData")

# Plot the data to see the relationship between x and y: 
# - The dots represent the observed data, which contain noise (i.e. error)
#   we are trying to estimate using the data (i.e. via regression)
plot( linearity$x,linearity$y,pch=19,main='Scatterplot for X & Y',
      xlab='X',
      ylab='Y')

# What happens if we "fit a line" to the data to estimate the relationship 
# between x and y?
# - That is, we fit a linear regression line and assume the relationship 
#   between x and y is linear?
# (1) Fit the regression line
#     - NOTE: p-value for x is greater than 0.05; says there is no relationship 
#             between x and y, but we know that is not true from looking at the 
#             data. Happens because this analysis ignored the non-linear 
#             relationship between x and y. 
#     - the estimated slope is -0.07 and associated p-value is 0.15
fit <- lm(y~x,data=linearity) 
summary(fit)
## 
## Call:
## lm(formula = y ~ x, data = linearity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4729 -1.8603  0.1531  1.9353  4.5427 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.19872    0.89892   14.68 3.09e-15 ***
## x           -0.07083    0.04754   -1.49    0.147    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.483 on 30 degrees of freedom
## Multiple R-squared:  0.06889,    Adjusted R-squared:  0.03785 
## F-statistic:  2.22 on 1 and 30 DF,  p-value: 0.1467
# (2) Plot the fitted regression line (blue) and again the data (dots) 
#     
plot( linearity$x,linearity$y,pch=19,main='Scatterplot for X & Y with 
      Simple Linear Regression Line', xlab='X', ylab='Y')
abline(fit,col='blue',lty=2, lwd=2)

# check the diagnostics
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit)

par(mfrow=c(1,1)) # Change back to 1 x 1
# You can see a clear curvilinear pattern in the Residuals vs Fitted plot suggesting
# a problem with linearity.
# End of Program