# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 2 Day 2 ~
# ~ Regression Diagnostics ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

##################################################################
# Muscle Contraction in Alcoholics: 
# This dataset contains maximum voluntary contraction (MVC) of quadriceps muscle, age, 
# and height, of 41 male alcoholics. Researchers were interested in investigating the 
# effect of both age and height on mean MVC.

# Data Set: mve

# Data Dictionary: 
# (1) age      age (years) 
# (2) height   height (cm)
# (3) mvc      Max voluntary contraction, quadriceps muscle (newtons)

# Download and load the data files 
download.file("http://www.duke.edu/~sgrambow/crp245data/mvc.RData",
              destfile = "mvc.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("mvc.RData")
############################################
# Summary and Visualization of Muscle Data
############################################

# summary for Y -- mvc
summary(muscle$mvc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    74.0   270.0   343.0   322.1   402.0   540.0
sd(muscle$mvc)
## [1] 112.1767
hist(muscle$mvc)

# summary for X1 -- age -- in years
summary(muscle$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24.00   34.00   41.00   43.54   53.00   65.00
sd(muscle$age) # standard deviation
## [1] 11.32717
hist(muscle$age)

plot(muscle$age,muscle$mvc) # scatterplot mvc vs age
abline(lm(mvc ~ age, data=muscle)) # simple regression line

# summary for X2 -- height -- height in cm
summary(muscle$height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   155.0   167.0   172.0   170.7   175.0   180.0
sd(muscle$height) # standard deviation
## [1] 6.53079
hist(muscle$height) 

plot(muscle$height,muscle$mvc) # scatterplot mvc vs height
abline(lm(mvc ~ height, data=muscle)) # simple regression line

# scatterplot matrix for mvc data
pairs(~mvc+age+height,main="Simple Scatterplot Matrix",lower.panel=NULL, data=muscle)

# Alternate Scatterplot Matrix from the car Package
# install.packages("car")
library(car)
## Loading required package: carData
scatterplotMatrix(~mvc+age+height, data=muscle, diagonal=list(method ="histogram"),
                  main="Muscle Scatterplots")

# simple Linear Regression with AGE
sfit.age <- lm(mvc ~ age, data=muscle)
summary(sfit.age)
## 
## Call:
## lm(formula = mvc ~ age, data = muscle)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -226.88  -67.05   10.41   64.31  207.41 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  501.858     64.794   7.745 2.08e-09 ***
## age           -4.128      1.441  -2.864   0.0067 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.3 on 39 degrees of freedom
## Multiple R-squared:  0.1738, Adjusted R-squared:  0.1526 
## F-statistic: 8.203 on 1 and 39 DF,  p-value: 0.006701
# simple Linear Regression with HEIGHT
sfit.height <- lm(mvc ~ height, data=muscle)
summary(sfit.height)
## 
## Call:
## lm(formula = mvc ~ height, data = muscle)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -228.446  -43.229    0.729   49.134  195.757 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -907.626    426.612  -2.128  0.03975 * 
## height         7.203      2.497   2.885  0.00635 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.1 on 39 degrees of freedom
## Multiple R-squared:  0.1758, Adjusted R-squared:  0.1547 
## F-statistic: 8.321 on 1 and 39 DF,  p-value: 0.006351
# Multiple Linear Regression
# Model:  mvc (Y) ~ age (X1) + height (X2)
mfit <- lm(mvc ~ age + height, data=muscle)
summary(mfit)
## 
## Call:
## lm(formula = mvc ~ age + height, data = muscle)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -220.523  -33.483    5.665   50.917  170.841 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -465.626    460.333  -1.011   0.3182  
## age           -3.075      1.467  -2.096   0.0428 *
## height         5.398      2.545   2.121   0.0405 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 98.92 on 38 degrees of freedom
## Multiple R-squared:  0.2612, Adjusted R-squared:  0.2224 
## F-statistic: 6.719 on 2 and 38 DF,  p-value: 0.003173
# Now examine regression diagnostics for this model
# Using Adjusted Linear Regression Model above -- mfit
#
# single command to produce suggested diagnostic plots
plot(mfit)

# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points

# To get all plots on one figure
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(mfit)

par(mfrow=c(1,1)) # Change back to 1 x 1


# Alternate QQ Plot from the car Package
qqPlot(mfit$residuals, main="Q-Q Plot")

## [1] 5 8
# FEV1 and Genetic Variation: 
# A study was conducted to determine if there was an association between a genetic 
# variant and forced expiratory volume in one second (FEV1) in patients with COPD. 
# FEV1 was measured in liters. Genotypes of interest were wild type vs. mutant 
# (i.e. heterozygous or homozygous for risk allele). Sex was also recorded. 
# 100 patients were randomly selected from the researchers clinical practice. 

# Data Dictionary: 
# 1.  FEV1      forced expiratory volume in one second (liters)
# 2.  GENO      patient genotyps (1 = Mutant; 0 = Wild Type)
# 3.    SEX       patient sex (1 = Female; 0 = Male)

# Download and load the FEV1 dataset used in lecture: 
download.file("http://www.duke.edu/~sgrambow/crp241data/fev1_geno.RData",
              destfile = "fev1_geno.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("fev1_geno.RData")

# scatterplot matrix for mvc data
pairs(~FEV1+GENO+SEX,main="Simple Scatterplot Matrix",data=fgdata)

# Alternate Scatterplot Matrix from the car Package
library(car)
scatterplotMatrix(~FEV1+GENO+SEX, data=fgdata,
                   main="FEV1 Scatterplots")

# Unadjusted Linear Regression Analysis
ufit <- lm(FEV1 ~ GENO, data=fgdata)
summary(ufit)
## 
## Call:
## lm(formula = FEV1 ~ GENO, data = fgdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35865 -0.68975 -0.02921  0.59003  2.63869 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.64134    0.09888  36.824  < 2e-16 ***
## GENO        -0.41683    0.14907  -2.796  0.00568 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.046 on 198 degrees of freedom
## Multiple R-squared:  0.03799,    Adjusted R-squared:  0.03313 
## F-statistic: 7.819 on 1 and 198 DF,  p-value: 0.005681
# Adjusted Linear Regression Analysis
afit <- lm(FEV1 ~ GENO + SEX, data=fgdata)
summary(afit)
## 
## Call:
## lm(formula = FEV1 ~ GENO + SEX, data = fgdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.23691 -0.69417 -0.01816  0.77985  2.36431 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9157     0.1081  36.213  < 2e-16 ***
## GENO         -0.2090     0.1467  -1.425    0.156    
## SEX          -0.7317     0.1456  -5.025 1.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9877 on 197 degrees of freedom
## Multiple R-squared:  0.1473, Adjusted R-squared:  0.1386 
## F-statistic: 17.02 on 2 and 197 DF,  p-value: 1.526e-07
# Now examine regression diagnostics for this model
# Uisng Adjusted Linear Regression Model above -- afit
#
# single command to produce suggested diagnostic plots
plot(afit)

# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points

# To get all plots on one figure
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(afit)

par(mfrow=c(1,1)) # Change back to 1 x 1

# Alternate QQ Plot from the car Package
qqPlot(afit$residuals, main="Q-Q Plot")

## [1]  19 169
# Example -- Cigarette Data -- Influectional Points and Multicollinearity
# The Federal Trade Commission annually rates varieties of domestic
# cigarettes according to their tar, nicotine, and carbon monoxide
# content.  The United States Surgeon General considers each of these
# substances hazardous to a smoker's health.  Past studies have shown
# that increases in the tar and nicotine content of a cigarette are
# accompanied by an increase in the carbon monoxide emitted from the
# cigarette smoke.  
# The data presented here are taken from Mendenhall and Sincich (1992)
# and are a subset of the data produced by the Federal Trade Commission.

# Data Dictionary: 
# 1.  Brand    Brand of cigarette (25 brands)
# 2.  Tar      Tar content (mg)
# 3.    Nicotine Nicotine content (mg)
# 4.  Weight   Weight (g)
# 5.  CO       Carbon monoxide content (mg)

# Download and load the cigarette dataset used in lecture: 
download.file("http://www.duke.edu/~sgrambow/crp241data/cigarette.RData",
              destfile = "cigarette.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("cigarette.RData")

# display summary statistics for dataset
str(cigarette)
## 'data.frame':    25 obs. of  5 variables:
##  $ Brand   : Factor w/ 25 levels "Alpine","Benson&Hedges",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Tar     : num  14.1 16 29.8 8 4.1 15 8.8 12.4 16.6 14.9 ...
##  $ Nicotine: num  0.86 1.06 2.03 0.67 0.4 1.04 0.76 0.95 1.12 1.02 ...
##  $ Weight  : num  0.985 1.094 1.165 0.928 0.946 ...
##  $ CO      : num  13.6 16.6 23.5 10.2 5.4 15 9 12.3 16.3 15.4 ...
head(cigarette)
##           Brand  Tar Nicotine Weight   CO
## 1        Alpine 14.1     0.86 0.9853 13.6
## 2 Benson&Hedges 16.0     1.06 1.0938 16.6
## 3    BullDurham 29.8     2.03 1.1650 23.5
## 4   CamelLights  8.0     0.67 0.9280 10.2
## 5       Carlton  4.1     0.40 0.9462  5.4
## 6  Chesterfield 15.0     1.04 0.8885 15.0
summary(cigarette)
##            Brand         Tar           Nicotine          Weight      
##  Alpine       : 1   Min.   : 1.00   Min.   :0.1300   Min.   :0.7851  
##  Benson&Hedges: 1   1st Qu.: 8.60   1st Qu.:0.6900   1st Qu.:0.9225  
##  BullDurham   : 1   Median :12.80   Median :0.9000   Median :0.9573  
##  CamelLights  : 1   Mean   :12.22   Mean   :0.8764   Mean   :0.9703  
##  Carlton      : 1   3rd Qu.:15.10   3rd Qu.:1.0200   3rd Qu.:1.0070  
##  Chesterfield : 1   Max.   :29.80   Max.   :2.0300   Max.   :1.1650  
##  (Other)      :19                                                    
##        CO       
##  Min.   : 1.50  
##  1st Qu.:10.00  
##  Median :13.00  
##  Mean   :12.53  
##  3rd Qu.:15.40  
##  Max.   :23.50  
## 
# scatterplot matrix for cigarette data
pairs(~CO+Tar+Nicotine,main="Simple Scatterplot Matrix",data=cigarette)

scatterplotMatrix(~CO+Tar+Nicotine, data=cigarette, diagonal="histogram",
                  main="Cigarette Scatterplots")
## Warning in applyDefaults(diagonal, defaults = list(method =
## "adaptiveDensity"), : unnamed diag arguments, will be ignored

# Correlations
# Calculate Pearson correlations between each pair of variables
cor(cigarette$CO,cigarette$Tar,method="pearson")
## [1] 0.9574853
cor(cigarette$CO,cigarette$Nicotine,method="pearson")
## [1] 0.9259473
cor(cigarette$Tar,cigarette$Nicotine,method="pearson")
## [1] 0.9766076
# Simple Linear Regression with Tar
fit.CO.Tar <- lm(CO ~ Tar,data=cigarette)
summary(fit.CO.Tar)
## 
## Call:
## lm(formula = CO ~ Tar, data = cigarette)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1124 -0.7167 -0.3754  1.0091  2.5450 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.74328    0.67521   4.063 0.000481 ***
## Tar          0.80098    0.05032  15.918 6.55e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.397 on 23 degrees of freedom
## Multiple R-squared:  0.9168, Adjusted R-squared:  0.9132 
## F-statistic: 253.4 on 1 and 23 DF,  p-value: 6.552e-14
# Simple Linear Regression with Nicotine
fit.CO.NIC <- lm(CO ~ Nicotine,data=cigarette)
summary(fit.CO.NIC, data=cigarette)
## 
## Call:
## lm(formula = CO ~ Nicotine, data = cigarette)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3273 -1.2228  0.2304  1.2700  3.9357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.6647     0.9936   1.675    0.107    
## Nicotine     12.3954     1.0542  11.759 3.31e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.828 on 23 degrees of freedom
## Multiple R-squared:  0.8574, Adjusted R-squared:  0.8512 
## F-statistic: 138.3 on 1 and 23 DF,  p-value: 3.312e-11
# Multiple Linear Regression with Tar & Nicotine
mfit.CO.TAR.NIC <- lm(CO~ Tar+Nicotine,data=cigarette)
summary(mfit.CO.TAR.NIC)
## 
## Call:
## lm(formula = CO ~ Tar + Nicotine, data = cigarette)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.89941 -0.78470 -0.00144  0.91585  2.43064 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.0896     0.8438   3.662 0.001371 ** 
## Tar           0.9625     0.2367   4.067 0.000512 ***
## Nicotine     -2.6463     3.7872  -0.699 0.492035    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.413 on 22 degrees of freedom
## Multiple R-squared:  0.9186, Adjusted R-squared:  0.9112 
## F-statistic: 124.1 on 2 and 22 DF,  p-value: 1.042e-12
# Now examine regression diagnostics for this model
# Uisng Adjusted Linear Regression Model above -- mfit.CO.TAR.NIC
#
# single command to produce suggested diagnostic plots
plot(mfit.CO.TAR.NIC)

# this will produce 4 plots
# 1. Residuals vs fitted to check linearity
# 2. Normal Q-Q to check normality of residuals
# 3. Scale-Location to check constant variance
# 4. Residuals vs Leverage to check for influential points

# remove outlier identified in leverage plot data point 3
cigarette_reduced <- cigarette[-3,]

# Multiple Linear Regression with Tar & Nicotine minus leverage point
mfit.CO.TAR.NIC.Reduced <- lm(CO~ Tar+Nicotine,data=cigarette_reduced)
summary(mfit.CO.TAR.NIC.Reduced)
## 
## Call:
## lm(formula = CO ~ Tar + Nicotine, data = cigarette_reduced)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7661 -0.7877 -0.3204  1.0654  2.3736 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.3089     0.8483   1.543 0.137795    
## Tar           0.8918     0.1927   4.628 0.000145 ***
## Nicotine      0.6289     3.2034   0.196 0.846235    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.144 on 21 degrees of freedom
## Multiple R-squared:  0.9336, Adjusted R-squared:  0.9273 
## F-statistic: 147.6 on 2 and 21 DF,  p-value: 4.304e-13
# check diagnostic plot again
plot(mfit.CO.TAR.NIC.Reduced)

# Need car package for multiollinearity function vif()
#install.packages('car')
library(car)
# Evaluate multicollinearity
vif(mfit.CO.TAR.NIC.Reduced) # variance inflation factors 
##      Tar Nicotine 
## 12.72247 12.72247
sqrt(vif(mfit.CO.TAR.NIC.Reduced)) > 2 # problem?
##      Tar Nicotine 
##     TRUE     TRUE
# Remove one of the collinear variables to fix this issue!

# End of Program