# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 6-7 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Body Fat Data:
# Percentage of body fat, age, weight, height, and ten body circumference
# measurements (e.g., abdomen, neck, ankle, etc.) are recorded for 252 men.
# Body fat is estimated through an underwater weighing technique.
# There are numerous variables in the lead dataset but those of interest for
# the current exercise include:
# 1. brobf : % Body Fat
# 2. abdmn : Abdomen Circumference (cm)
# 3. neck : Neck Circumference (cm)
# 4. ankle : Ankle Circumference (cm)
# 5. height : Height (in)
## Download and load the data file = bpdata
download.file("http://www.duke.edu/~sgrambow/crp241data/bodyfat.RData",
destfile = "bodyfat.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("bodyfat.RData")
## Scatter Plots:
# 1. % Body Fat vs. Abdomen Circumference
# This is one of the scatterplots on Slide 9
# of the Module 4 Day 6-7 slides
plot(x=bodyfat$abdmn,y=bodyfat$brobf,
main='Scatter Plot of % Body Fat vs. Abdomen Circ',
ylab='% Body Fat',
xlab='Abdomen Circumference (cm)',
cex=1,pch=0,col='sienna3')

# 2. Neck Circumference vs. Ankle Circumference
# This is one of the scatterplots on Slide 9
# of the Module 4 Day 6-7 slides
plot(x=bodyfat$neck,y=bodyfat$ankle,
main='Scatter Plot of Neck vs. Ankle Circ',
ylab='Neck Circumference (cm)',
xlab='Ankle Circumference (cm)',
cex=1,pch=1,col='sienna3')

# 3. % Body Fat vs. Height
# This is one of the scatterplots on Slide 10
# of the Module 4 Day 6-7 slides
plot(x=bodyfat$height,y=bodyfat$brobf,
main='Scatter Plot of % Body Fat vs. Height',
ylab='% Body Fat',
xlab='Height (in)',
cex=1,pch=2,col='sienna3')

# removing the single outlier
# changes the impression of the scatterplot dramatically
bsub <- subset(bodyfat,bodyfat$height > 35)
plot(x=bsub$height,y=bsub$brobf,
main='Scatter Plot of % Body Fat vs. Height',
ylab='% Body Fat',
xlab='Height (in)',
cex=1,pch=2,col='sienna3')

cor(bsub$brobf,bsub$height)
## [1] -0.02261395
# Basic Scatterplot Matrix
# demonstrated with bpdata
# See slide 10 of the Module 4 Day 6-7 slides
pairs(~bodyfat$brobf + bodyfat$abdm + bodyfat$ankle + bodyfat$height,data=bodyfat,
col='blue',main="Simple Scatterplot Matrix")

# Scatterplot Matrices from the car Package
# there are numerous options to customize this plot
# see function help for details
# See slide 14 of the Module 4 Day 6-7 slides
# You made need to first uncomment the line below and install the car package
# install.packages('car')
library(car)
## Loading required package: carData
scatterplotMatrix(~bodyfat$brobf + bodyfat$abdmn + bodyfat$ankle + bodyfat$height,
data=bodyfat,main="Scatterplot Matrix bodyfat",smooth=FALSE,regLine=FALSE,
diagonal=list(method="histogram"))

## Correlation Coefficients and Tests of Ho: Rho = 0:
# See slide 18 of the Module 4 Day 6-7 slides
# 1. % Body Fat vs. Abdomen Circumference
cor.test(x=bodyfat$abdmn,y=bodyfat$brobf,method='pearson')
##
## Pearson's product-moment correlation
##
## data: bodyfat$abdmn and bodyfat$brobf
## t = 22.134, df = 250, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7672855 0.8516445
## sample estimates:
## cor
## 0.8137062
# 2. Neck Circumference vs. Ankle Circumference
cor.test(x=bodyfat$neck,y=bodyfat$ankle,method='pearson')
##
## Pearson's product-moment correlation
##
## data: bodyfat$neck and bodyfat$ankle
## t = 8.602, df = 250, p-value = 8.768e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3765570 0.5679266
## sample estimates:
## cor
## 0.4778924
# 3. % Body Fat vs. Height
cor.test(x=bodyfat$height,y=bodyfat$brobf,method='pearson')
##
## Pearson's product-moment correlation
##
## data: bodyfat$height and bodyfat$brobf
## t = -1.4145, df = 250, p-value = 0.1585
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.21036293 0.03485018
## sample estimates:
## cor
## -0.08910641
## Simple Linear Regression Model:
# Model: % Body Fat ~ Abdomen Circ
# - Use the function lm() to fit the linear regression model
fit1 <- lm(brobf~abdmn,data=bodyfat)
summary(fit1)
##
## Call:
## lm(formula = brobf ~ abdmn, data = bodyfat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.6257 -3.4672 0.0111 3.1415 11.9754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35.19661 2.46229 -14.29 <2e-16 ***
## abdmn 0.58489 0.02643 22.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.514 on 250 degrees of freedom
## Multiple R-squared: 0.6621, Adjusted R-squared: 0.6608
## F-statistic: 489.9 on 1 and 250 DF, p-value: < 2.2e-16
confint(fit1)
## 2.5 % 97.5 %
## (Intercept) -40.046092 -30.3471234
## abdmn 0.532846 0.6369351
# Add regression line to scatter plot
# See slide 51 of the Module 4 Day 6-7 slides
plot(x=bodyfat$abdmn,y=bodyfat$brobf,
main='Scatter Plot of % Body Fat vs. Abdomen Circ',
ylab='% Body Fat',
xlab='Abdomen Circumference (cm)',
cex=1,pch=0,col='sienna3')
abline(fit1,lty=2,col='black',lwd=2)

# Checking the assumptions of the simple linear regression model
# Now examine regression diagnostics for this model
# Using Linear Regression Model above
#
# single command to produce suggested diagnostic plots
# See slides 86-87 of the Module 4 Day 6-7 slides
plot(fit1)




# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points
# To get all plots on one figure
# See slide 87 of the Module 4 Day 6-7 slides
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit1)

par(mfrow=c(1,1)) # Change back to 1 x 1
bodyfat_no39 <- bodyfat[-39,]
# - Use the function lm() to fit the linear regression model
fit_no39 <- lm(brobf~abdmn,data=bodyfat_no39)
summary(fit_no39)
##
## Call:
## lm(formula = brobf ~ abdmn, data = bodyfat_no39)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0655 -3.4059 0.1948 2.8981 11.8462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -38.60529 2.51098 -15.38 <2e-16 ***
## abdmn 0.62257 0.02703 23.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.366 on 249 degrees of freedom
## Multiple R-squared: 0.6806, Adjusted R-squared: 0.6793
## F-statistic: 530.5 on 1 and 249 DF, p-value: < 2.2e-16
confint(fit_no39)
## 2.5 % 97.5 %
## (Intercept) -43.5507568 -33.6598163
## abdmn 0.5693308 0.6758044
# Add regression line to scatter plot
plot(x=bodyfat_no39$abdmn,y=bodyfat_no39$brobf,
main='Scatter Plot of % Body Fat vs. Abdomen Circ',
ylab='% Body Fat',
xlab='Abdomen Circumference (cm)',
col='blue',pch=19,cex=1.5)
abline(fit_no39,lty=1,col='black',lwd=2)

# what is predicted % body fat for men with 100 cm adbmn
# for the model without point 39
predict(fit_no39,newdata=data.frame(abdmn=100),interval='conf')
## fit lwr upr
## 1 23.65147 22.97244 24.33051
# what is predicted % body fat for men with 100 cm adbmn
# for the model with point 39
predict(fit1,newdata=data.frame(abdmn=100),interval='conf')
## fit lwr upr
## 1 23.29244 22.61142 23.97347
# use qqplot() function from Car Package
# for a slightly nicer qq plot
# See slide 92 of the Module 4 Day 6-7 slides
library(car)
qqPlot(fit1, main="Q-Q Plot") # from car package

## [1] 39 207
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 7 Linearity Example ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ Sample showing regression diagnostics ~
# ~ for SLR with curvilinear data pattern ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Author -- Megan Neely, PhD (with some modifications from Steve)
# Download and load the linearity dataset:
download.file("http://www.duke.edu/~sgrambow/crp241data/linearity.RData",
destfile = "linearity.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("linearity.RData")
# Plot the data to see the relationship between x and y:
# - The dots represent the observed data, which contain noise (i.e. error)
# we are trying to estimate using the data (i.e. via regression)
plot( linearity$x,linearity$y,pch=19,main='Scatterplot for X & Y',
xlab='X',
ylab='Y')

# What happens if we "fit a line" to the data to estimate the relationship
# between x and y?
# - That is, we fit a linear regression line and assume the relationship
# between x and y is linear?
# (1) Fit the regression line
# - NOTE: p-value for x is greater than 0.05; says there is no relationship
# between x and y, but we know that is not true from looking at the
# data. Happens because this analysis ignored the non-linear
# relationship between x and y.
# - the estimated slope is -0.07 and associated p-value is 0.15
fit <- lm(y~x,data=linearity)
summary(fit)
##
## Call:
## lm(formula = y ~ x, data = linearity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4729 -1.8603 0.1531 1.9353 4.5427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.19872 0.89892 14.68 3.09e-15 ***
## x -0.07083 0.04754 -1.49 0.147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.483 on 30 degrees of freedom
## Multiple R-squared: 0.06889, Adjusted R-squared: 0.03785
## F-statistic: 2.22 on 1 and 30 DF, p-value: 0.1467
# (2) Plot the fitted regression line (blue) and again the data (dots)
#
plot( linearity$x,linearity$y,pch=19,main='Scatterplot for X & Y with
Simple Linear Regression Line', xlab='X', ylab='Y')
abline(fit,col='blue',lty=2, lwd=2)

# check the diagnostics
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit)

par(mfrow=c(1,1)) # Change back to 1 x 1
# You can see a clear curvilinear pattern in the Residuals vs Fitted plot suggesting
# a problem with linearity.
# End of Program