# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 2 Day 2 ~
# ~ Regression Diagnostics ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
##################################################################
# Muscle Contraction in Alcoholics:
# This dataset contains maximum voluntary contraction (MVC) of quadriceps muscle, age,
# and height, of 41 male alcoholics. Researchers were interested in investigating the
# effect of both age and height on mean MVC.
# Data Set: mve
# Data Dictionary:
# (1) age age (years)
# (2) height height (cm)
# (3) mvc Max voluntary contraction, quadriceps muscle (newtons)
# Download and load the data files
download.file("http://www.duke.edu/~sgrambow/crp245data/mvc.RData",
destfile = "mvc.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("mvc.RData")
############################################
# Summary and Visualization of Muscle Data
############################################
# summary for Y -- mvc
summary(muscle$mvc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 74.0 270.0 343.0 322.1 402.0 540.0
sd(muscle$mvc)
## [1] 112.1767
hist(muscle$mvc)

# summary for X1 -- age -- in years
summary(muscle$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 34.00 41.00 43.54 53.00 65.00
sd(muscle$age) # standard deviation
## [1] 11.32717
hist(muscle$age)

plot(muscle$age,muscle$mvc) # scatterplot mvc vs age
abline(lm(mvc ~ age, data=muscle)) # simple regression line

# summary for X2 -- height -- height in cm
summary(muscle$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 155.0 167.0 172.0 170.7 175.0 180.0
sd(muscle$height) # standard deviation
## [1] 6.53079
hist(muscle$height)

plot(muscle$height,muscle$mvc) # scatterplot mvc vs height
abline(lm(mvc ~ height, data=muscle)) # simple regression line

# scatterplot matrix for mvc data
pairs(~mvc+age+height,main="Simple Scatterplot Matrix",lower.panel=NULL, data=muscle)

# Alternate Scatterplot Matrix from the car Package
# install.packages("car")
library(car)
## Loading required package: carData
scatterplotMatrix(~mvc+age+height, data=muscle, diagonal=list(method ="histogram"),
main="Muscle Scatterplots")

# simple Linear Regression with AGE
sfit.age <- lm(mvc ~ age, data=muscle)
summary(sfit.age)
##
## Call:
## lm(formula = mvc ~ age, data = muscle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.88 -67.05 10.41 64.31 207.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 501.858 64.794 7.745 2.08e-09 ***
## age -4.128 1.441 -2.864 0.0067 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.3 on 39 degrees of freedom
## Multiple R-squared: 0.1738, Adjusted R-squared: 0.1526
## F-statistic: 8.203 on 1 and 39 DF, p-value: 0.006701
# simple Linear Regression with HEIGHT
sfit.height <- lm(mvc ~ height, data=muscle)
summary(sfit.height)
##
## Call:
## lm(formula = mvc ~ height, data = muscle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -228.446 -43.229 0.729 49.134 195.757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -907.626 426.612 -2.128 0.03975 *
## height 7.203 2.497 2.885 0.00635 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.1 on 39 degrees of freedom
## Multiple R-squared: 0.1758, Adjusted R-squared: 0.1547
## F-statistic: 8.321 on 1 and 39 DF, p-value: 0.006351
# Multiple Linear Regression
# Model: mvc (Y) ~ age (X1) + height (X2)
mfit <- lm(mvc ~ age + height, data=muscle)
summary(mfit)
##
## Call:
## lm(formula = mvc ~ age + height, data = muscle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220.523 -33.483 5.665 50.917 170.841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -465.626 460.333 -1.011 0.3182
## age -3.075 1.467 -2.096 0.0428 *
## height 5.398 2.545 2.121 0.0405 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98.92 on 38 degrees of freedom
## Multiple R-squared: 0.2612, Adjusted R-squared: 0.2224
## F-statistic: 6.719 on 2 and 38 DF, p-value: 0.003173
# Now examine regression diagnostics for this model
# Using Adjusted Linear Regression Model above -- mfit
#
# single command to produce suggested diagnostic plots
plot(mfit)




# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points
# To get all plots on one figure
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(mfit)

par(mfrow=c(1,1)) # Change back to 1 x 1
# Alternate QQ Plot from the car Package
qqPlot(mfit$residuals, main="Q-Q Plot")

## [1] 5 8
# FEV1 and Genetic Variation:
# A study was conducted to determine if there was an association between a genetic
# variant and forced expiratory volume in one second (FEV1) in patients with COPD.
# FEV1 was measured in liters. Genotypes of interest were wild type vs. mutant
# (i.e. heterozygous or homozygous for risk allele). Sex was also recorded.
# 100 patients were randomly selected from the researchers clinical practice.
# Data Dictionary:
# 1. FEV1 forced expiratory volume in one second (liters)
# 2. GENO patient genotyps (1 = Mutant; 0 = Wild Type)
# 3. SEX patient sex (1 = Female; 0 = Male)
# Download and load the FEV1 dataset used in lecture:
download.file("http://www.duke.edu/~sgrambow/crp241data/fev1_geno.RData",
destfile = "fev1_geno.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("fev1_geno.RData")
# scatterplot matrix for mvc data
pairs(~FEV1+GENO+SEX,main="Simple Scatterplot Matrix",data=fgdata)

# Alternate Scatterplot Matrix from the car Package
library(car)
scatterplotMatrix(~FEV1+GENO+SEX, data=fgdata,
main="FEV1 Scatterplots")

# Unadjusted Linear Regression Analysis
ufit <- lm(FEV1 ~ GENO, data=fgdata)
summary(ufit)
##
## Call:
## lm(formula = FEV1 ~ GENO, data = fgdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35865 -0.68975 -0.02921 0.59003 2.63869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.64134 0.09888 36.824 < 2e-16 ***
## GENO -0.41683 0.14907 -2.796 0.00568 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.046 on 198 degrees of freedom
## Multiple R-squared: 0.03799, Adjusted R-squared: 0.03313
## F-statistic: 7.819 on 1 and 198 DF, p-value: 0.005681
# Adjusted Linear Regression Analysis
afit <- lm(FEV1 ~ GENO + SEX, data=fgdata)
summary(afit)
##
## Call:
## lm(formula = FEV1 ~ GENO + SEX, data = fgdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23691 -0.69417 -0.01816 0.77985 2.36431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9157 0.1081 36.213 < 2e-16 ***
## GENO -0.2090 0.1467 -1.425 0.156
## SEX -0.7317 0.1456 -5.025 1.12e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9877 on 197 degrees of freedom
## Multiple R-squared: 0.1473, Adjusted R-squared: 0.1386
## F-statistic: 17.02 on 2 and 197 DF, p-value: 1.526e-07
# Now examine regression diagnostics for this model
# Uisng Adjusted Linear Regression Model above -- afit
#
# single command to produce suggested diagnostic plots
plot(afit)




# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points
# To get all plots on one figure
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(afit)

par(mfrow=c(1,1)) # Change back to 1 x 1
# Alternate QQ Plot from the car Package
qqPlot(afit$residuals, main="Q-Q Plot")

## [1] 19 169
# Example -- Cigarette Data -- Influectional Points and Multicollinearity
# The Federal Trade Commission annually rates varieties of domestic
# cigarettes according to their tar, nicotine, and carbon monoxide
# content. The United States Surgeon General considers each of these
# substances hazardous to a smoker's health. Past studies have shown
# that increases in the tar and nicotine content of a cigarette are
# accompanied by an increase in the carbon monoxide emitted from the
# cigarette smoke.
# The data presented here are taken from Mendenhall and Sincich (1992)
# and are a subset of the data produced by the Federal Trade Commission.
# Data Dictionary:
# 1. Brand Brand of cigarette (25 brands)
# 2. Tar Tar content (mg)
# 3. Nicotine Nicotine content (mg)
# 4. Weight Weight (g)
# 5. CO Carbon monoxide content (mg)
# Download and load the cigarette dataset used in lecture:
download.file("http://www.duke.edu/~sgrambow/crp241data/cigarette.RData",
destfile = "cigarette.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("cigarette.RData")
# display summary statistics for dataset
str(cigarette)
## 'data.frame': 25 obs. of 5 variables:
## $ Brand : Factor w/ 25 levels "Alpine","Benson&Hedges",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Tar : num 14.1 16 29.8 8 4.1 15 8.8 12.4 16.6 14.9 ...
## $ Nicotine: num 0.86 1.06 2.03 0.67 0.4 1.04 0.76 0.95 1.12 1.02 ...
## $ Weight : num 0.985 1.094 1.165 0.928 0.946 ...
## $ CO : num 13.6 16.6 23.5 10.2 5.4 15 9 12.3 16.3 15.4 ...
head(cigarette)
## Brand Tar Nicotine Weight CO
## 1 Alpine 14.1 0.86 0.9853 13.6
## 2 Benson&Hedges 16.0 1.06 1.0938 16.6
## 3 BullDurham 29.8 2.03 1.1650 23.5
## 4 CamelLights 8.0 0.67 0.9280 10.2
## 5 Carlton 4.1 0.40 0.9462 5.4
## 6 Chesterfield 15.0 1.04 0.8885 15.0
summary(cigarette)
## Brand Tar Nicotine Weight
## Alpine : 1 Min. : 1.00 Min. :0.1300 Min. :0.7851
## Benson&Hedges: 1 1st Qu.: 8.60 1st Qu.:0.6900 1st Qu.:0.9225
## BullDurham : 1 Median :12.80 Median :0.9000 Median :0.9573
## CamelLights : 1 Mean :12.22 Mean :0.8764 Mean :0.9703
## Carlton : 1 3rd Qu.:15.10 3rd Qu.:1.0200 3rd Qu.:1.0070
## Chesterfield : 1 Max. :29.80 Max. :2.0300 Max. :1.1650
## (Other) :19
## CO
## Min. : 1.50
## 1st Qu.:10.00
## Median :13.00
## Mean :12.53
## 3rd Qu.:15.40
## Max. :23.50
##
# scatterplot matrix for cigarette data
pairs(~CO+Tar+Nicotine,main="Simple Scatterplot Matrix",data=cigarette)

scatterplotMatrix(~CO+Tar+Nicotine, data=cigarette, diagonal="histogram",
main="Cigarette Scatterplots")
## Warning in applyDefaults(diagonal, defaults = list(method =
## "adaptiveDensity"), : unnamed diag arguments, will be ignored

# Correlations
# Calculate Pearson correlations between each pair of variables
cor(cigarette$CO,cigarette$Tar,method="pearson")
## [1] 0.9574853
cor(cigarette$CO,cigarette$Nicotine,method="pearson")
## [1] 0.9259473
cor(cigarette$Tar,cigarette$Nicotine,method="pearson")
## [1] 0.9766076
# Simple Linear Regression with Tar
fit.CO.Tar <- lm(CO ~ Tar,data=cigarette)
summary(fit.CO.Tar)
##
## Call:
## lm(formula = CO ~ Tar, data = cigarette)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1124 -0.7167 -0.3754 1.0091 2.5450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.74328 0.67521 4.063 0.000481 ***
## Tar 0.80098 0.05032 15.918 6.55e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.397 on 23 degrees of freedom
## Multiple R-squared: 0.9168, Adjusted R-squared: 0.9132
## F-statistic: 253.4 on 1 and 23 DF, p-value: 6.552e-14
# Simple Linear Regression with Nicotine
fit.CO.NIC <- lm(CO ~ Nicotine,data=cigarette)
summary(fit.CO.NIC, data=cigarette)
##
## Call:
## lm(formula = CO ~ Nicotine, data = cigarette)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3273 -1.2228 0.2304 1.2700 3.9357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6647 0.9936 1.675 0.107
## Nicotine 12.3954 1.0542 11.759 3.31e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.828 on 23 degrees of freedom
## Multiple R-squared: 0.8574, Adjusted R-squared: 0.8512
## F-statistic: 138.3 on 1 and 23 DF, p-value: 3.312e-11
# Multiple Linear Regression with Tar & Nicotine
mfit.CO.TAR.NIC <- lm(CO~ Tar+Nicotine,data=cigarette)
summary(mfit.CO.TAR.NIC)
##
## Call:
## lm(formula = CO ~ Tar + Nicotine, data = cigarette)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.89941 -0.78470 -0.00144 0.91585 2.43064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0896 0.8438 3.662 0.001371 **
## Tar 0.9625 0.2367 4.067 0.000512 ***
## Nicotine -2.6463 3.7872 -0.699 0.492035
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.413 on 22 degrees of freedom
## Multiple R-squared: 0.9186, Adjusted R-squared: 0.9112
## F-statistic: 124.1 on 2 and 22 DF, p-value: 1.042e-12
# Now examine regression diagnostics for this model
# Uisng Adjusted Linear Regression Model above -- mfit.CO.TAR.NIC
#
# single command to produce suggested diagnostic plots
plot(mfit.CO.TAR.NIC)




# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points
# remove outlier identified in leverage plot data point 3
cigarette_reduced <- cigarette[-3,]
# Multiple Linear Regression with Tar & Nicotine minus leverage point
mfit.CO.TAR.NIC.Reduced <- lm(CO~ Tar+Nicotine,data=cigarette_reduced)
summary(mfit.CO.TAR.NIC.Reduced)
##
## Call:
## lm(formula = CO ~ Tar + Nicotine, data = cigarette_reduced)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7661 -0.7877 -0.3204 1.0654 2.3736
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3089 0.8483 1.543 0.137795
## Tar 0.8918 0.1927 4.628 0.000145 ***
## Nicotine 0.6289 3.2034 0.196 0.846235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.144 on 21 degrees of freedom
## Multiple R-squared: 0.9336, Adjusted R-squared: 0.9273
## F-statistic: 147.6 on 2 and 21 DF, p-value: 4.304e-13
# check diagnostic plot again
plot(mfit.CO.TAR.NIC.Reduced)




# Need car package for multiollinearity function vif()
#install.packages('car')
library(car)
# Evaluate multicollinearity
vif(mfit.CO.TAR.NIC.Reduced) # variance inflation factors
## Tar Nicotine
## 12.72247 12.72247
sqrt(vif(mfit.CO.TAR.NIC.Reduced)) > 2 # problem?
## Tar Nicotine
## TRUE TRUE
# Remove one of the collinear variables to fix this issue!
# End of Program