module2_day3_Rscript.R

# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 2 Day 3 ~
# ~ Regression Diagnostics ~
# ~ And Other Stuff        ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# Muscle Contraction in Alcoholics: 
# This dataset contains maximum voluntary contraction (MVC) of quadriceps muscle, age, 
# and height, of 41 male alcoholics. Researchers were interested in investigating the 
# effect of both age and height on mean MVC.

# Data Set: mve

# Data Dictionary: 
# (1) age      age (years) 
# (2) height   height (cm)
# (3) mvc      Max voluntary contraction, quadriceps muscle (newtons)


# Download and load the data files 
download.file("http://www.duke.edu/~sgrambow/crp245data/mvc.RData",
              destfile = "mvc.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("mvc.RData")
#
# Summary and Visualization of Muscle Data
#

# summary for Y -- mvc
# smallest values
sort(unique(muscle$mvc), decreasing = FALSE)[1:5]

## [1]  74  98 137 147 196

head(sort(muscle$mvc))

## [1]  74  98 137 147 147 196

# largest values
sort(unique(muscle$mvc), decreasing = TRUE)[1:5]

## [1] 540 491 466 441 417

tail(sort(muscle$mvc))

## [1] 441 441 466 466 491 540

# descriptive statistics
summary(muscle$mvc)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    74.0   270.0   343.0   322.1   402.0   540.0

sd(muscle$mvc)

## [1] 112.1767

# fivenum() generates min, Q1, median, Q3, max
fivenum(muscle$mvc)

## [1]  74 270 343 402 540

# Visualization of Y
hist(muscle$mvc, # histogram
     col="blue", # column color
     border="black",
     prob = TRUE, # show densities instead of frequencies
     xlab = "Newtons",
     main = "Histogram w Density Plot")
lines(density(muscle$mvc), # density plot
      lwd = 2, # thickness of line
      col = "red")

# summary for X1 -- age -- in years
# smallest values
sort(unique(muscle$age), decreasing = FALSE)[1:5]

## [1] 24 27 28 31 32

# largest values
sort(unique(muscle$age), decreasing = TRUE)[1:5]

## [1] 65 62 61 58 55

# descriptive statistics
summary(muscle$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24.00   34.00   41.00   43.54   53.00   65.00

sd(muscle$age)

## [1] 11.32717

# fivenum() generates min, Q1, median, Q3, max
fivenum(muscle$age)

## [1] 24 34 41 53 65

# Visualization of Y
hist(muscle$age, # histogram
     col="blue", # column color
     border="black",
     prob = TRUE, # show densities instead of frequencies
     xlab = "Age in Years",
     main = "Histogram w Density Plot")
lines(density(muscle$age), # density plot
      lwd = 2, # thickness of line
      col = "red")

cor.test(muscle$age,muscle$mvc)

## 
##  Pearson's product-moment correlation
## 
## data:  muscle$age and muscle$mvc
## t = -2.8641, df = 39, p-value = 0.006701
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6421620 -0.1252862
## sample estimates:
##        cor 
## -0.4168687

# Scatterplot with Y
# link to web page about scatterplots
# http://www.sthda.com/english/wiki/scatter-plots-r-base-graphs
# Enhanced Scatterplot 
library(car)

## Loading required package: carData

scatterplot(mvc ~ age, data=muscle,
            xlab="Age", ylab="MVC",
            main=" Scatter Plot")

# the plot contains
# -- the points
# -- the regression line 
# -- the smoothed conditional spread 
# -- the non-parametric regression smootH


# summary for X2 -- height -- height in cm
# smallest values
#
#        WE WILL FILL THIS IN      #
#

# SCATTERPLOT MATRIX MVC, AGE, HEIGHT
# includes pearson correlations
# just change the data frame name in the
# pairs command
# Correlation panel
upper.panel<-function(x, y){
  points(x,y, pch=19)
  r <- round(cor(x, y), digits=2)
  txt <- paste0("R = ", r)
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  text(0.5, 0.9, txt)}
# Create the plots
pairs(muscle, lower.panel = NULL, 
      upper.panel = upper.panel)

#
# REGRESSION MODELS              #
#

# Simple Linear Regression 
# Model mvc (Y) ~ age (X1)
ufit1 <- lm(muscle$mvc ~ muscle$age)
summary(ufit1)

## 
## Call:
## lm(formula = muscle$mvc ~ muscle$age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -226.88  -67.05   10.41   64.31  207.41 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  501.858     64.794   7.745 2.08e-09 ***
## muscle$age    -4.128      1.441  -2.864   0.0067 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.3 on 39 degrees of freedom
## Multiple R-squared:  0.1738, Adjusted R-squared:  0.1526 
## F-statistic: 8.203 on 1 and 39 DF,  p-value: 0.006701

# Simple Linear Regression 
# Model mvc (Y) ~ height (X2)
ufit2 <- lm(muscle$mvc ~ muscle$height)
summary(ufit2)

## 
## Call:
## lm(formula = muscle$mvc ~ muscle$height)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -228.446  -43.229    0.729   49.134  195.757 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   -907.626    426.612  -2.128  0.03975 * 
## muscle$height    7.203      2.497   2.885  0.00635 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.1 on 39 degrees of freedom
## Multiple R-squared:  0.1758, Adjusted R-squared:  0.1547 
## F-statistic: 8.321 on 1 and 39 DF,  p-value: 0.006351

# Multiple Linear Regression
# Model:  mvc (Y) ~ age (X1) + height (X2)
mfit <- lm(mvc ~ age + height, data=muscle)
summary(mfit)

## 
## Call:
## lm(formula = mvc ~ age + height, data = muscle)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -220.523  -33.483    5.665   50.917  170.841 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -465.626    460.333  -1.011   0.3182  
## age           -3.075      1.467  -2.096   0.0428 *
## height         5.398      2.545   2.121   0.0405 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 98.92 on 38 degrees of freedom
## Multiple R-squared:  0.2612, Adjusted R-squared:  0.2224 
## F-statistic: 6.719 on 2 and 38 DF,  p-value: 0.003173

# The ANOVA Table for Multiple Regression
# ANOVA Table can provide overall summary
# of a multiple regression analysis
# Want to load the CAR package
# to use its Anova function
# which calculates Type III sums of squares
# which are generally best
# nice ANOVA tutorial here:
# http://www.understandingdata.net/2017/05/11/anova-tables-in-r/#APAtable

library(car)
# generate ANOVA table for our model
# difference between Type I, Type II, and Type III Sums of Squares
# https://rcompanion.org/rcompanion/d_04.html
Anova(mfit,type = "II")

## Anova Table (Type II tests)
## 
## Response: mvc
##           Sum Sq Df F value  Pr(>F)  
## age        42985  1  4.3927 0.04281 *
## height     44024  1  4.4989 0.04050 *
## Residuals 371849 38                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Other Useful Regression functions/quantities
# that are contained in the fit object created
# by regression
coefficients(mfit) # model coefficients

## (Intercept)         age      height 
## -465.626228   -3.075405    5.398182

confint(mfit, level=0.95) # CIs for model parameters

##                     2.5 %      97.5 %
## (Intercept) -1397.5226142 466.2701576
## age            -6.0459158  -0.1048933
## height          0.2460489  10.5503147

fitted(mfit)[1:10] # predicted values

##        1        2        3        4        5        6        7        8 
## 356.6622 396.0197 382.1479 392.9443 367.5235 367.5235 299.6699 364.4481 
##        9       10 
## 402.2354 391.4390

residuals(mfit)[1:10] # residuals

##          1          2          3          4          5          6          7 
##  109.33776  -92.01966  -39.14789   11.05574 -220.52350  -73.52350   92.33009 
##          8          9         10 
## -217.44809 -132.23537   20.56100

# We can generate predictions from the model
predict(mfit,data.frame(age=24,height=166),interval = "confidence")

##        fit      lwr      upr
## 1 356.6622 279.8663 433.4582

# We can also make a plot of observed vs predicted
plot(predict(mfit),muscle$mvc,
     xlab="predicted",ylab="actual")
# we can use abline(a,b)
# which intercept/slope form
# a=0 is y-intercept -- indicating (0,0)
# b=1 is slope of 1 -- 
# so abline(a=0,b=1) is the line y=x 
# if observed = predicted, all points should be on this line
abline(a=0,b=1)

# Now Check Model Diagnostics
# Model (mfit):  mvc (Y) ~ age (X1) + height (X2)
plot(mfit)

# Additional Diagnostics focused
# on measures of influence
library(olsrr)

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

# Cook's D Chart 
ols_plot_cooksd_chart(mfit)

# DFBETAs Panel
ols_plot_dfbetas(mfit)

# DFFITs Plot
ols_plot_dffits(mfit)

# Standardized Residual Chart
ols_plot_resid_stand(mfit)

# Studentized Residuals vs Leverage Plot
ols_plot_resid_lev(mfit)

#
# Lets return to the BMI dataset 
#
# Download and load the BMI dataset: 
download.file("http://www.duke.edu/~sgrambow/crp241data/BMIdata.RData",
              destfile = "BMIdata.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("BMIdata.RData")

# examine data structure
str(fram1)

## 'data.frame':    4434 obs. of  38 variables:
##  $ SEX     : int  1 2 1 2 2 2 2 2 1 1 ...
##  $ RANDID  : int  2448 6238 9428 10552 11252 11263 12629 12806 14367 16365 ...
##  $ TOTCHOL : int  195 250 245 225 285 228 205 313 260 225 ...
##  $ AGE     : int  39 46 48 61 46 43 63 45 52 43 ...
##  $ SYSBP   : num  106 121 128 150 130 ...
##  $ DIABP   : num  70 81 80 95 84 110 71 71 89 107 ...
##  $ CURSMOKE: int  0 0 1 1 1 0 0 1 0 1 ...
##  $ CIGPDAY : int  0 0 20 30 23 0 0 20 0 30 ...
##  $ BMI     : num  27 28.7 25.3 28.6 23.1 ...
##  $ DIABETES: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BPMEDS  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE: int  80 95 75 65 85 77 60 79 76 93 ...
##  $ GLUCOSE : int  77 76 70 103 85 99 85 78 79 88 ...
##  $ PREVCHD : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVAP  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVMI  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVSTRK: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVHYP : int  0 0 0 1 0 1 0 0 1 1 ...
##  $ TIME    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PERIOD  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HDLC    : logi  NA NA NA NA NA NA ...
##  $ LDLC    : logi  NA NA NA NA NA NA ...
##  $ DEATH   : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ ANGINA  : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ HOSPMI  : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ ANYCHD  : int  1 0 0 0 0 1 1 0 0 0 ...
##  $ STROKE  : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ CVD     : int  1 0 0 1 0 1 0 0 0 0 ...
##  $ HYPERTEN: int  0 0 0 1 1 1 1 1 1 1 ...
##  $ TIMEAP  : int  8766 8766 8766 2956 8766 8766 373 8766 8766 8766 ...
##  $ TIMEMI  : int  6438 8766 8766 2956 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMIFC: int  6438 8766 8766 2956 8766 5719 8766 8766 8766 8766 ...
##  $ TIMECHD : int  6438 8766 8766 2956 8766 5719 373 8766 8766 8766 ...
##  $ TIMESTRK: int  8766 8766 8766 2089 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECVD : int  6438 8766 8766 2089 8766 5719 8766 8766 8766 8766 ...
##  $ TIMEDTH : int  8766 8766 8766 2956 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEHYP : int  8766 8766 8766 0 4285 0 2212 8679 0 0 ...

# we will use the ifelse function
# format is ifelse(a,b,c)
# a is the condition of interest (here male which is 1)
# b is the value you want to assign to those 
#   who have the condition of interest (we want to replace
#   (1 with a 1).
# c is the value for those who do not have the condition
#   (here that is 2=females and we map them to 0)
# NOTE: be sure there are no other possible values!!!

fram1$MALE <- ifelse(fram1$SEX==1,1,0)


# Check recoding:
head(fram1[c("SEX","MALE")],20)

##    SEX MALE
## 1    1    1
## 2    2    0
## 3    1    1
## 4    2    0
## 5    2    0
## 6    2    0
## 7    2    0
## 8    2    0
## 9    1    1
## 10   1    1
## 11   2    0
## 12   2    0
## 13   1    1
## 14   2    0
## 15   2    0
## 16   2    0
## 17   1    1
## 18   2    0
## 19   2    0
## 20   1    1

tail(fram1[c("SEX","MALE")],20)

##      SEX MALE
## 4415   1    1
## 4416   1    1
## 4417   1    1
## 4418   1    1
## 4419   1    1
## 4420   1    1
## 4421   1    1
## 4422   1    1
## 4423   2    0
## 4424   2    0
## 4425   2    0
## 4426   1    1
## 4427   1    1
## 4428   1    1
## 4429   1    1
## 4430   2    0
## 4431   2    0
## 4432   2    0
## 4433   1    1
## 4434   2    0

#
# Now lets use a different approach to code categorical
# recoding tutorial
# https://www.theanalysisfactor.com/r-tutorial-recoding-values/
# Categorical BMI making it 4 levels like this
# 0 = 0-normal
# 1 = 1-underweight
# 2 = 2-overweight
# 3 = 3-obese
fram1$cBMI <- ifelse(fram1$BMI<18.5, "Under", 
                         ifelse(fram1$BMI>=18.5 & fram1$BMI<25, "Norm",
                                ifelse(fram1$BMI>=25 & fram1$BMI<30, "Over",
                                       ifelse(fram1$BMI>=30, "Obese",NA))))

# Check recoding:
head(fram1[c("BMI","cBMI")],20)

##      BMI  cBMI
## 1  26.97  Over
## 2  28.73  Over
## 3  25.34  Over
## 4  28.58  Over
## 5  23.10  Norm
## 6  30.30 Obese
## 7  33.11 Obese
## 8  21.68  Norm
## 9  26.36  Over
## 10 23.61  Norm
## 11 22.91  Norm
## 12 27.64  Over
## 13 26.31  Over
## 14 31.31 Obese
## 15 22.35  Norm
## 16 21.35  Norm
## 17 22.37  Norm
## 18 23.38  Norm
## 19 23.24  Norm
## 20 26.88  Over

tail(fram1[c("BMI","cBMI")],20)

##        BMI  cBMI
## 4415 21.85  Norm
## 4416 26.70  Over
## 4417 21.68  Norm
## 4418 25.23  Over
## 4419 24.24  Norm
## 4420 32.18 Obese
## 4421 26.05  Over
## 4422 25.62  Over
## 4423 43.67 Obese
## 4424 25.60  Over
## 4425 22.89  Norm
## 4426 24.96  Norm
## 4427 23.14  Norm
## 4428 25.97  Over
## 4429 19.71  Norm
## 4430 22.00  Norm
## 4431 19.16  Norm
## 4432 21.47  Norm
## 4433 25.60  Over
## 4434 20.91  Norm

# Now fit this model -- fit5.mlr
# fit a multiple linear regression model with 
# Y = systolic blood pressure (SYSBP)
# X1 = cBMI (categorized BMI)
# X2 = age (AGE)
# X3 = sex (MALE); 0 = female; 1 = male
# X4 = antihypertensive medication use (BPMEDS)

fit5.mlr <- lm(SYSBP ~ cBMI + AGE + MALE + BPMEDS, data = fram1)
summary(fit5.mlr)

## 
## Call:
## lm(formula = SYSBP ~ cBMI + AGE + MALE + BPMEDS, data = fram1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 84.67507    1.73970  48.672  < 2e-16 ***
## cBMIObese   14.85372    0.92743  16.016  < 2e-16 ***
## cBMIOver     7.20117    0.64426  11.177  < 2e-16 ***
## cBMIUnder   -6.46970    2.58809  -2.500   0.0125 *  
## AGE          0.87289    0.03431  25.444  < 2e-16 ***
## MALE        -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS      25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16

confint(fit5.mlr)

##                   2.5 %     97.5 %
## (Intercept)  81.2643742 88.0857712
## cBMIObese    13.0354799 16.6719508
## cBMIOver      5.9381004  8.4642479
## cBMIUnder   -11.5436658 -1.3957319
## AGE           0.8056299  0.9401465
## MALE         -3.5771345 -1.2323013
## BPMEDS       21.7702235 28.2728012

# Generate ANOVA table for model so we can
# determine the p-value for Categorized BMI variable in the model.
# Use car package Anova function
library(car)
# generate ANOVA table for our model
# difference between Type I, Type II, and Type III Sums of Squares
# https://rcompanion.org/rcompanion/d_04.html
Anova(fit5.mlr,type = "II")

## Anova Table (Type II tests)
## 
## Response: SYSBP
##            Sum Sq   Df F value    Pr(>F)    
## cBMI       115279    3  103.81 < 2.2e-16 ***
## AGE        239647    1  647.39 < 2.2e-16 ***
## MALE         5986    1   16.17 5.889e-05 ***
## BPMEDS      84268    1  227.64 < 2.2e-16 ***
## Residuals 1609157 4347                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# the resulting p-value is very small and significant

# Check Diagnostics for this model
plot(fit5.mlr)

# Check Measures of Influence
# install and load Olsrr package which offers the following tools to detect influential observations:
# -- Cook's D Chart
# -- DFBETAs Panel
# -- DFFITs Plot
# -- Standardized Residual Chart
# -- Studentized Residuals vs Leverage Plot
# install.packages("olsrr")
library(olsrr)

# Cook's D Chart 
ols_plot_cooksd_chart(fit5.mlr)

# DFBETAs Panel
ols_plot_dfbetas(fit5.mlr)

# DFFITs Plot
ols_plot_dffits(fit5.mlr)

# Standardized Residual Chart
ols_plot_resid_stand(fit5.mlr)

# Studentized Residuals vs Leverage Plot
ols_plot_resid_lev(fit5.mlr)

# What if our categorical variable isnt a character
# variable but is instead a factor variable
fram1$fBMI <- factor(fram1$cBMI)
str(fram1)

## 'data.frame':    4434 obs. of  41 variables:
##  $ SEX     : int  1 2 1 2 2 2 2 2 1 1 ...
##  $ RANDID  : int  2448 6238 9428 10552 11252 11263 12629 12806 14367 16365 ...
##  $ TOTCHOL : int  195 250 245 225 285 228 205 313 260 225 ...
##  $ AGE     : int  39 46 48 61 46 43 63 45 52 43 ...
##  $ SYSBP   : num  106 121 128 150 130 ...
##  $ DIABP   : num  70 81 80 95 84 110 71 71 89 107 ...
##  $ CURSMOKE: int  0 0 1 1 1 0 0 1 0 1 ...
##  $ CIGPDAY : int  0 0 20 30 23 0 0 20 0 30 ...
##  $ BMI     : num  27 28.7 25.3 28.6 23.1 ...
##  $ DIABETES: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BPMEDS  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE: int  80 95 75 65 85 77 60 79 76 93 ...
##  $ GLUCOSE : int  77 76 70 103 85 99 85 78 79 88 ...
##  $ PREVCHD : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVAP  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVMI  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVSTRK: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVHYP : int  0 0 0 1 0 1 0 0 1 1 ...
##  $ TIME    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PERIOD  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HDLC    : logi  NA NA NA NA NA NA ...
##  $ LDLC    : logi  NA NA NA NA NA NA ...
##  $ DEATH   : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ ANGINA  : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ HOSPMI  : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ ANYCHD  : int  1 0 0 0 0 1 1 0 0 0 ...
##  $ STROKE  : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ CVD     : int  1 0 0 1 0 1 0 0 0 0 ...
##  $ HYPERTEN: int  0 0 0 1 1 1 1 1 1 1 ...
##  $ TIMEAP  : int  8766 8766 8766 2956 8766 8766 373 8766 8766 8766 ...
##  $ TIMEMI  : int  6438 8766 8766 2956 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMIFC: int  6438 8766 8766 2956 8766 5719 8766 8766 8766 8766 ...
##  $ TIMECHD : int  6438 8766 8766 2956 8766 5719 373 8766 8766 8766 ...
##  $ TIMESTRK: int  8766 8766 8766 2089 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECVD : int  6438 8766 8766 2089 8766 5719 8766 8766 8766 8766 ...
##  $ TIMEDTH : int  8766 8766 8766 2956 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEHYP : int  8766 8766 8766 0 4285 0 2212 8679 0 0 ...
##  $ MALE    : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ cBMI    : chr  "Over" "Over" "Over" "Over" ...
##  $ fBMI    : Factor w/ 4 levels "Norm","Obese",..: 3 3 3 3 1 2 2 1 3 1 ...

levels(fram1$fBMI)

## [1] "Norm"  "Obese" "Over"  "Under"

# lets say we wanted to change the reference level
# of BMI to be OBESE.  How would we do that?
# We can use the relevel function with factors
# syntax: new.variable <- relevel(old.variable, ref="c")
# lets call the new variable obese.cBMI

# Then
fram1$obese.fBMI <-relevel(fram1$fBMI,ref="Obese")
levels(fram1$obese.fBMI)

## [1] "Obese" "Norm"  "Over"  "Under"

levels(fram1$fBMI)

## [1] "Norm"  "Obese" "Over"  "Under"

# Here is the original factor version
fBMI.fit <- lm(SYSBP ~ fBMI + AGE + MALE + BPMEDS, data = fram1)
summary(fBMI.fit)

## 
## Call:
## lm(formula = SYSBP ~ fBMI + AGE + MALE + BPMEDS, data = fram1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 84.67507    1.73970  48.672  < 2e-16 ***
## fBMIObese   14.85372    0.92743  16.016  < 2e-16 ***
## fBMIOver     7.20117    0.64426  11.177  < 2e-16 ***
## fBMIUnder   -6.46970    2.58809  -2.500   0.0125 *  
## AGE          0.87289    0.03431  25.444  < 2e-16 ***
## MALE        -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS      25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16

# Here is the releveled factor version
obese.fBMI.fit <- lm(SYSBP ~ obese.fBMI + AGE + MALE + BPMEDS, data = fram1)
summary(obese.fBMI.fit)

## 
## Call:
## lm(formula = SYSBP ~ obese.fBMI + AGE + MALE + BPMEDS, data = fram1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      99.52879    1.95167  50.997  < 2e-16 ***
## obese.fBMINorm  -14.85372    0.92743 -16.016  < 2e-16 ***
## obese.fBMIOver   -7.65254    0.92892  -8.238 2.29e-16 ***
## obese.fBMIUnder -21.32341    2.67961  -7.958 2.22e-15 ***
## AGE               0.87289    0.03431  25.444  < 2e-16 ***
## MALE             -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS           25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16

# Nice UCLA tutorial on R contrast coding options
# https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/


#
# Lets return to the BRFSS dataset 
#

# BRFSS Data:
# The Behavioral Risk Factor Surveillance System (BRFSS) 
# is the nation's premier system of health-related telephone 
# surveys that collect state data about U.S. residents regarding 
# their health-related risk behaviors, chronic health conditions, 
# and use of preventive services. 

# 2018 Cohort Charcteristics 
# Overall cohort size = 437436 observations of 275 variables
# Restricting cohort to NC = 4729 observations
#      ***  subset(brfss2018,`_STATE`==37)  ***
# Restricting cohort to those who drank at least one drink of 
#      alcohol in last 30 days = 2084 observations
#      ***  subset(nc.brfss,DRNKANY5==1)  ***
# Restricting cohort to low-quantity (non-heavy) alcohol
#             drinkers = 1759 observations
#      ***  subset(l30.nc.brfss,`_RFDRHV6`==1)  ***

# We have brfssm1d1 with 1759 observations and have included the following variables:
# 1.  BMI             (units=kilograms per meter squared)
# 2.  DAYS.DRINKING   (# of Days on which had at least 1 alcoholic drink in past 30 days)
# 3.  AGE             (in years, collapsed above 80)
# 4.  SLEEP           (average # hours sleep in a 24 hour period)
# 5.  EXERCISE.YN     (Y/N participate in any physical activites/exercise during past month)
# 6.  FEMALE          (Sex at birth; 0=MALE, 1=FEMALE)

# Download and load the data file = brfssm1d1
download.file("http://www.duke.edu/~sgrambow/crp241data/brfssm1d1.RData",
              destfile = "brfssm1d1.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("brfssm1d1.RData")

# Lets fit a multiple linear regression model-- bmi.fit2
# fit a multiple linear regression model with 
# Y = BMI
# X1 = DAYS.DRINKING 
# X2 = AGE
# X3 = SLEEP
# X4 = EXERCISE.YN
# X5 = FEMALE

bmi.fit2 <- lm(BMI ~ DAYS.DRINKING +AGE + SLEEP + EXERCISE.YN + FEMALE, data=brfssm1d1)
summary(bmi.fit2)

## 
## Call:
## lm(formula = BMI ~ DAYS.DRINKING + AGE + SLEEP + EXERCISE.YN + 
##     FEMALE, data = brfssm1d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.6206  -3.7051  -0.8026   2.4948  29.0633 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   24.800838   0.903225  27.458  < 2e-16 ***
## DAYS.DRINKING -0.098593   0.019413  -5.079 4.22e-07 ***
## AGE            0.017992   0.008024   2.242   0.0251 *  
## SLEEP         -0.044673   0.105855  -0.422   0.6731    
## EXERCISE.YN    2.814573   0.390835   7.201 8.95e-13 ***
## FEMALE        -0.497176   0.280881  -1.770   0.0769 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.664 on 1683 degrees of freedom
##   (70 observations deleted due to missingness)
## Multiple R-squared:  0.04963,    Adjusted R-squared:  0.04681 
## F-statistic: 17.58 on 5 and 1683 DF,  p-value: < 2.2e-16

# Generate standard diagnostic plots
plot(bmi.fit2)

# Additional Diagnostics focused
# on measures of influence
library(olsrr)
# Cook's D Chart 
ols_plot_cooksd_chart(bmi.fit2)

# DFBETAs Panel
ols_plot_dfbetas(bmi.fit2)

# DFFITs Plot
ols_plot_dffits(bmi.fit2)

# Standardized Residual Chart
ols_plot_resid_stand(bmi.fit2)

# Studentized Residuals vs Leverage Plot
ols_plot_resid_lev(bmi.fit2)

#


# End of Program

module2_day3_Rscript.R

sgrambow

2020-02-19