# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 2 Day 3 ~
# ~ Regression Diagnostics ~
# ~ And Other Stuff ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Muscle Contraction in Alcoholics:
# This dataset contains maximum voluntary contraction (MVC) of quadriceps muscle, age,
# and height, of 41 male alcoholics. Researchers were interested in investigating the
# effect of both age and height on mean MVC.
# Data Set: mve
# Data Dictionary:
# (1) age age (years)
# (2) height height (cm)
# (3) mvc Max voluntary contraction, quadriceps muscle (newtons)
# Download and load the data files
download.file("http://www.duke.edu/~sgrambow/crp245data/mvc.RData",
destfile = "mvc.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("mvc.RData")
#
# Summary and Visualization of Muscle Data
#
# summary for Y -- mvc
# smallest values
sort(unique(muscle$mvc), decreasing = FALSE)[1:5]
## [1] 74 98 137 147 196
head(sort(muscle$mvc))
## [1] 74 98 137 147 147 196
# largest values
sort(unique(muscle$mvc), decreasing = TRUE)[1:5]
## [1] 540 491 466 441 417
tail(sort(muscle$mvc))
## [1] 441 441 466 466 491 540
# descriptive statistics
summary(muscle$mvc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 74.0 270.0 343.0 322.1 402.0 540.0
sd(muscle$mvc)
## [1] 112.1767
# fivenum() generates min, Q1, median, Q3, max
fivenum(muscle$mvc)
## [1] 74 270 343 402 540
# Visualization of Y
hist(muscle$mvc, # histogram
col="blue", # column color
border="black",
prob = TRUE, # show densities instead of frequencies
xlab = "Newtons",
main = "Histogram w Density Plot")
lines(density(muscle$mvc), # density plot
lwd = 2, # thickness of line
col = "red")

# summary for X1 -- age -- in years
# smallest values
sort(unique(muscle$age), decreasing = FALSE)[1:5]
## [1] 24 27 28 31 32
# largest values
sort(unique(muscle$age), decreasing = TRUE)[1:5]
## [1] 65 62 61 58 55
# descriptive statistics
summary(muscle$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 34.00 41.00 43.54 53.00 65.00
sd(muscle$age)
## [1] 11.32717
# fivenum() generates min, Q1, median, Q3, max
fivenum(muscle$age)
## [1] 24 34 41 53 65
# Visualization of Y
hist(muscle$age, # histogram
col="blue", # column color
border="black",
prob = TRUE, # show densities instead of frequencies
xlab = "Age in Years",
main = "Histogram w Density Plot")
lines(density(muscle$age), # density plot
lwd = 2, # thickness of line
col = "red")

cor.test(muscle$age,muscle$mvc)
##
## Pearson's product-moment correlation
##
## data: muscle$age and muscle$mvc
## t = -2.8641, df = 39, p-value = 0.006701
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6421620 -0.1252862
## sample estimates:
## cor
## -0.4168687
# Scatterplot with Y
# link to web page about scatterplots
# http://www.sthda.com/english/wiki/scatter-plots-r-base-graphs
# Enhanced Scatterplot
library(car)
## Loading required package: carData
scatterplot(mvc ~ age, data=muscle,
xlab="Age", ylab="MVC",
main=" Scatter Plot")

# the plot contains
# -- the points
# -- the regression line
# -- the smoothed conditional spread
# -- the non-parametric regression smootH
# summary for X2 -- height -- height in cm
# smallest values
#
# WE WILL FILL THIS IN #
#
# SCATTERPLOT MATRIX MVC, AGE, HEIGHT
# includes pearson correlations
# just change the data frame name in the
# pairs command
# Correlation panel
upper.panel<-function(x, y){
points(x,y, pch=19)
r <- round(cor(x, y), digits=2)
txt <- paste0("R = ", r)
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
text(0.5, 0.9, txt)}
# Create the plots
pairs(muscle, lower.panel = NULL,
upper.panel = upper.panel)

#
# REGRESSION MODELS #
#
# Simple Linear Regression
# Model mvc (Y) ~ age (X1)
ufit1 <- lm(muscle$mvc ~ muscle$age)
summary(ufit1)
##
## Call:
## lm(formula = muscle$mvc ~ muscle$age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.88 -67.05 10.41 64.31 207.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 501.858 64.794 7.745 2.08e-09 ***
## muscle$age -4.128 1.441 -2.864 0.0067 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.3 on 39 degrees of freedom
## Multiple R-squared: 0.1738, Adjusted R-squared: 0.1526
## F-statistic: 8.203 on 1 and 39 DF, p-value: 0.006701
# Simple Linear Regression
# Model mvc (Y) ~ height (X2)
ufit2 <- lm(muscle$mvc ~ muscle$height)
summary(ufit2)
##
## Call:
## lm(formula = muscle$mvc ~ muscle$height)
##
## Residuals:
## Min 1Q Median 3Q Max
## -228.446 -43.229 0.729 49.134 195.757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -907.626 426.612 -2.128 0.03975 *
## muscle$height 7.203 2.497 2.885 0.00635 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.1 on 39 degrees of freedom
## Multiple R-squared: 0.1758, Adjusted R-squared: 0.1547
## F-statistic: 8.321 on 1 and 39 DF, p-value: 0.006351
# Multiple Linear Regression
# Model: mvc (Y) ~ age (X1) + height (X2)
mfit <- lm(mvc ~ age + height, data=muscle)
summary(mfit)
##
## Call:
## lm(formula = mvc ~ age + height, data = muscle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220.523 -33.483 5.665 50.917 170.841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -465.626 460.333 -1.011 0.3182
## age -3.075 1.467 -2.096 0.0428 *
## height 5.398 2.545 2.121 0.0405 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98.92 on 38 degrees of freedom
## Multiple R-squared: 0.2612, Adjusted R-squared: 0.2224
## F-statistic: 6.719 on 2 and 38 DF, p-value: 0.003173
# The ANOVA Table for Multiple Regression
# ANOVA Table can provide overall summary
# of a multiple regression analysis
# Want to load the CAR package
# to use its Anova function
# which calculates Type III sums of squares
# which are generally best
# nice ANOVA tutorial here:
# http://www.understandingdata.net/2017/05/11/anova-tables-in-r/#APAtable
library(car)
# generate ANOVA table for our model
# difference between Type I, Type II, and Type III Sums of Squares
# https://rcompanion.org/rcompanion/d_04.html
Anova(mfit,type = "II")
## Anova Table (Type II tests)
##
## Response: mvc
## Sum Sq Df F value Pr(>F)
## age 42985 1 4.3927 0.04281 *
## height 44024 1 4.4989 0.04050 *
## Residuals 371849 38
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Other Useful Regression functions/quantities
# that are contained in the fit object created
# by regression
coefficients(mfit) # model coefficients
## (Intercept) age height
## -465.626228 -3.075405 5.398182
confint(mfit, level=0.95) # CIs for model parameters
## 2.5 % 97.5 %
## (Intercept) -1397.5226142 466.2701576
## age -6.0459158 -0.1048933
## height 0.2460489 10.5503147
fitted(mfit)[1:10] # predicted values
## 1 2 3 4 5 6 7 8
## 356.6622 396.0197 382.1479 392.9443 367.5235 367.5235 299.6699 364.4481
## 9 10
## 402.2354 391.4390
residuals(mfit)[1:10] # residuals
## 1 2 3 4 5 6 7
## 109.33776 -92.01966 -39.14789 11.05574 -220.52350 -73.52350 92.33009
## 8 9 10
## -217.44809 -132.23537 20.56100
# We can generate predictions from the model
predict(mfit,data.frame(age=24,height=166),interval = "confidence")
## fit lwr upr
## 1 356.6622 279.8663 433.4582
# We can also make a plot of observed vs predicted
plot(predict(mfit),muscle$mvc,
xlab="predicted",ylab="actual")
# we can use abline(a,b)
# which intercept/slope form
# a=0 is y-intercept -- indicating (0,0)
# b=1 is slope of 1 --
# so abline(a=0,b=1) is the line y=x
# if observed = predicted, all points should be on this line
abline(a=0,b=1)

# Now Check Model Diagnostics
# Model (mfit): mvc (Y) ~ age (X1) + height (X2)
plot(mfit)



# Additional Diagnostics focused
# on measures of influence
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers

# Cook's D Chart
ols_plot_cooksd_chart(mfit)

# DFBETAs Panel
ols_plot_dfbetas(mfit)

# DFFITs Plot
ols_plot_dffits(mfit)

# Standardized Residual Chart
ols_plot_resid_stand(mfit)

# Studentized Residuals vs Leverage Plot
ols_plot_resid_lev(mfit)

#
# Lets return to the BMI dataset
#
# Download and load the BMI dataset:
download.file("http://www.duke.edu/~sgrambow/crp241data/BMIdata.RData",
destfile = "BMIdata.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("BMIdata.RData")
# examine data structure
str(fram1)
## 'data.frame': 4434 obs. of 38 variables:
## $ SEX : int 1 2 1 2 2 2 2 2 1 1 ...
## $ RANDID : int 2448 6238 9428 10552 11252 11263 12629 12806 14367 16365 ...
## $ TOTCHOL : int 195 250 245 225 285 228 205 313 260 225 ...
## $ AGE : int 39 46 48 61 46 43 63 45 52 43 ...
## $ SYSBP : num 106 121 128 150 130 ...
## $ DIABP : num 70 81 80 95 84 110 71 71 89 107 ...
## $ CURSMOKE: int 0 0 1 1 1 0 0 1 0 1 ...
## $ CIGPDAY : int 0 0 20 30 23 0 0 20 0 30 ...
## $ BMI : num 27 28.7 25.3 28.6 23.1 ...
## $ DIABETES: int 0 0 0 0 0 0 0 0 0 0 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE: int 80 95 75 65 85 77 60 79 76 93 ...
## $ GLUCOSE : int 77 76 70 103 85 99 85 78 79 88 ...
## $ PREVCHD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVAP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVMI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVSTRK: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVHYP : int 0 0 0 1 0 1 0 0 1 1 ...
## $ TIME : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PERIOD : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HDLC : logi NA NA NA NA NA NA ...
## $ LDLC : logi NA NA NA NA NA NA ...
## $ DEATH : int 0 0 0 1 0 0 0 0 0 0 ...
## $ ANGINA : int 0 0 0 0 0 0 1 0 0 0 ...
## $ HOSPMI : int 1 0 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 1 0 0 0 0 1 0 0 0 0 ...
## $ ANYCHD : int 1 0 0 0 0 1 1 0 0 0 ...
## $ STROKE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ CVD : int 1 0 0 1 0 1 0 0 0 0 ...
## $ HYPERTEN: int 0 0 0 1 1 1 1 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 2956 8766 8766 373 8766 8766 8766 ...
## $ TIMEMI : int 6438 8766 8766 2956 8766 8766 8766 8766 8766 8766 ...
## $ TIMEMIFC: int 6438 8766 8766 2956 8766 5719 8766 8766 8766 8766 ...
## $ TIMECHD : int 6438 8766 8766 2956 8766 5719 373 8766 8766 8766 ...
## $ TIMESTRK: int 8766 8766 8766 2089 8766 8766 8766 8766 8766 8766 ...
## $ TIMECVD : int 6438 8766 8766 2089 8766 5719 8766 8766 8766 8766 ...
## $ TIMEDTH : int 8766 8766 8766 2956 8766 8766 8766 8766 8766 8766 ...
## $ TIMEHYP : int 8766 8766 8766 0 4285 0 2212 8679 0 0 ...
# we will use the ifelse function
# format is ifelse(a,b,c)
# a is the condition of interest (here male which is 1)
# b is the value you want to assign to those
# who have the condition of interest (we want to replace
# (1 with a 1).
# c is the value for those who do not have the condition
# (here that is 2=females and we map them to 0)
# NOTE: be sure there are no other possible values!!!
fram1$MALE <- ifelse(fram1$SEX==1,1,0)
# Check recoding:
head(fram1[c("SEX","MALE")],20)
## SEX MALE
## 1 1 1
## 2 2 0
## 3 1 1
## 4 2 0
## 5 2 0
## 6 2 0
## 7 2 0
## 8 2 0
## 9 1 1
## 10 1 1
## 11 2 0
## 12 2 0
## 13 1 1
## 14 2 0
## 15 2 0
## 16 2 0
## 17 1 1
## 18 2 0
## 19 2 0
## 20 1 1
tail(fram1[c("SEX","MALE")],20)
## SEX MALE
## 4415 1 1
## 4416 1 1
## 4417 1 1
## 4418 1 1
## 4419 1 1
## 4420 1 1
## 4421 1 1
## 4422 1 1
## 4423 2 0
## 4424 2 0
## 4425 2 0
## 4426 1 1
## 4427 1 1
## 4428 1 1
## 4429 1 1
## 4430 2 0
## 4431 2 0
## 4432 2 0
## 4433 1 1
## 4434 2 0
#
# Now lets use a different approach to code categorical
# recoding tutorial
# https://www.theanalysisfactor.com/r-tutorial-recoding-values/
# Categorical BMI making it 4 levels like this
# 0 = 0-normal
# 1 = 1-underweight
# 2 = 2-overweight
# 3 = 3-obese
fram1$cBMI <- ifelse(fram1$BMI<18.5, "Under",
ifelse(fram1$BMI>=18.5 & fram1$BMI<25, "Norm",
ifelse(fram1$BMI>=25 & fram1$BMI<30, "Over",
ifelse(fram1$BMI>=30, "Obese",NA))))
# Check recoding:
head(fram1[c("BMI","cBMI")],20)
## BMI cBMI
## 1 26.97 Over
## 2 28.73 Over
## 3 25.34 Over
## 4 28.58 Over
## 5 23.10 Norm
## 6 30.30 Obese
## 7 33.11 Obese
## 8 21.68 Norm
## 9 26.36 Over
## 10 23.61 Norm
## 11 22.91 Norm
## 12 27.64 Over
## 13 26.31 Over
## 14 31.31 Obese
## 15 22.35 Norm
## 16 21.35 Norm
## 17 22.37 Norm
## 18 23.38 Norm
## 19 23.24 Norm
## 20 26.88 Over
tail(fram1[c("BMI","cBMI")],20)
## BMI cBMI
## 4415 21.85 Norm
## 4416 26.70 Over
## 4417 21.68 Norm
## 4418 25.23 Over
## 4419 24.24 Norm
## 4420 32.18 Obese
## 4421 26.05 Over
## 4422 25.62 Over
## 4423 43.67 Obese
## 4424 25.60 Over
## 4425 22.89 Norm
## 4426 24.96 Norm
## 4427 23.14 Norm
## 4428 25.97 Over
## 4429 19.71 Norm
## 4430 22.00 Norm
## 4431 19.16 Norm
## 4432 21.47 Norm
## 4433 25.60 Over
## 4434 20.91 Norm
# Now fit this model -- fit5.mlr
# fit a multiple linear regression model with
# Y = systolic blood pressure (SYSBP)
# X1 = cBMI (categorized BMI)
# X2 = age (AGE)
# X3 = sex (MALE); 0 = female; 1 = male
# X4 = antihypertensive medication use (BPMEDS)
fit5.mlr <- lm(SYSBP ~ cBMI + AGE + MALE + BPMEDS, data = fram1)
summary(fit5.mlr)
##
## Call:
## lm(formula = SYSBP ~ cBMI + AGE + MALE + BPMEDS, data = fram1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.67507 1.73970 48.672 < 2e-16 ***
## cBMIObese 14.85372 0.92743 16.016 < 2e-16 ***
## cBMIOver 7.20117 0.64426 11.177 < 2e-16 ***
## cBMIUnder -6.46970 2.58809 -2.500 0.0125 *
## AGE 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
confint(fit5.mlr)
## 2.5 % 97.5 %
## (Intercept) 81.2643742 88.0857712
## cBMIObese 13.0354799 16.6719508
## cBMIOver 5.9381004 8.4642479
## cBMIUnder -11.5436658 -1.3957319
## AGE 0.8056299 0.9401465
## MALE -3.5771345 -1.2323013
## BPMEDS 21.7702235 28.2728012
# Generate ANOVA table for model so we can
# determine the p-value for Categorized BMI variable in the model.
# Use car package Anova function
library(car)
# generate ANOVA table for our model
# difference between Type I, Type II, and Type III Sums of Squares
# https://rcompanion.org/rcompanion/d_04.html
Anova(fit5.mlr,type = "II")
## Anova Table (Type II tests)
##
## Response: SYSBP
## Sum Sq Df F value Pr(>F)
## cBMI 115279 3 103.81 < 2.2e-16 ***
## AGE 239647 1 647.39 < 2.2e-16 ***
## MALE 5986 1 16.17 5.889e-05 ***
## BPMEDS 84268 1 227.64 < 2.2e-16 ***
## Residuals 1609157 4347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the resulting p-value is very small and significant
# Check Diagnostics for this model
plot(fit5.mlr)




# Check Measures of Influence
# install and load Olsrr package which offers the following tools to detect influential observations:
# -- Cook's D Chart
# -- DFBETAs Panel
# -- DFFITs Plot
# -- Standardized Residual Chart
# -- Studentized Residuals vs Leverage Plot
# install.packages("olsrr")
library(olsrr)
# Cook's D Chart
ols_plot_cooksd_chart(fit5.mlr)

# DFBETAs Panel
ols_plot_dfbetas(fit5.mlr)


# DFFITs Plot
ols_plot_dffits(fit5.mlr)

# Standardized Residual Chart
ols_plot_resid_stand(fit5.mlr)

# Studentized Residuals vs Leverage Plot
ols_plot_resid_lev(fit5.mlr)

# What if our categorical variable isnt a character
# variable but is instead a factor variable
fram1$fBMI <- factor(fram1$cBMI)
str(fram1)
## 'data.frame': 4434 obs. of 41 variables:
## $ SEX : int 1 2 1 2 2 2 2 2 1 1 ...
## $ RANDID : int 2448 6238 9428 10552 11252 11263 12629 12806 14367 16365 ...
## $ TOTCHOL : int 195 250 245 225 285 228 205 313 260 225 ...
## $ AGE : int 39 46 48 61 46 43 63 45 52 43 ...
## $ SYSBP : num 106 121 128 150 130 ...
## $ DIABP : num 70 81 80 95 84 110 71 71 89 107 ...
## $ CURSMOKE: int 0 0 1 1 1 0 0 1 0 1 ...
## $ CIGPDAY : int 0 0 20 30 23 0 0 20 0 30 ...
## $ BMI : num 27 28.7 25.3 28.6 23.1 ...
## $ DIABETES: int 0 0 0 0 0 0 0 0 0 0 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE: int 80 95 75 65 85 77 60 79 76 93 ...
## $ GLUCOSE : int 77 76 70 103 85 99 85 78 79 88 ...
## $ PREVCHD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVAP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVMI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVSTRK: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVHYP : int 0 0 0 1 0 1 0 0 1 1 ...
## $ TIME : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PERIOD : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HDLC : logi NA NA NA NA NA NA ...
## $ LDLC : logi NA NA NA NA NA NA ...
## $ DEATH : int 0 0 0 1 0 0 0 0 0 0 ...
## $ ANGINA : int 0 0 0 0 0 0 1 0 0 0 ...
## $ HOSPMI : int 1 0 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 1 0 0 0 0 1 0 0 0 0 ...
## $ ANYCHD : int 1 0 0 0 0 1 1 0 0 0 ...
## $ STROKE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ CVD : int 1 0 0 1 0 1 0 0 0 0 ...
## $ HYPERTEN: int 0 0 0 1 1 1 1 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 2956 8766 8766 373 8766 8766 8766 ...
## $ TIMEMI : int 6438 8766 8766 2956 8766 8766 8766 8766 8766 8766 ...
## $ TIMEMIFC: int 6438 8766 8766 2956 8766 5719 8766 8766 8766 8766 ...
## $ TIMECHD : int 6438 8766 8766 2956 8766 5719 373 8766 8766 8766 ...
## $ TIMESTRK: int 8766 8766 8766 2089 8766 8766 8766 8766 8766 8766 ...
## $ TIMECVD : int 6438 8766 8766 2089 8766 5719 8766 8766 8766 8766 ...
## $ TIMEDTH : int 8766 8766 8766 2956 8766 8766 8766 8766 8766 8766 ...
## $ TIMEHYP : int 8766 8766 8766 0 4285 0 2212 8679 0 0 ...
## $ MALE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ cBMI : chr "Over" "Over" "Over" "Over" ...
## $ fBMI : Factor w/ 4 levels "Norm","Obese",..: 3 3 3 3 1 2 2 1 3 1 ...
levels(fram1$fBMI)
## [1] "Norm" "Obese" "Over" "Under"
# lets say we wanted to change the reference level
# of BMI to be OBESE. How would we do that?
# We can use the relevel function with factors
# syntax: new.variable <- relevel(old.variable, ref="c")
# lets call the new variable obese.cBMI
# Then
fram1$obese.fBMI <-relevel(fram1$fBMI,ref="Obese")
levels(fram1$obese.fBMI)
## [1] "Obese" "Norm" "Over" "Under"
levels(fram1$fBMI)
## [1] "Norm" "Obese" "Over" "Under"
# Here is the original factor version
fBMI.fit <- lm(SYSBP ~ fBMI + AGE + MALE + BPMEDS, data = fram1)
summary(fBMI.fit)
##
## Call:
## lm(formula = SYSBP ~ fBMI + AGE + MALE + BPMEDS, data = fram1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.67507 1.73970 48.672 < 2e-16 ***
## fBMIObese 14.85372 0.92743 16.016 < 2e-16 ***
## fBMIOver 7.20117 0.64426 11.177 < 2e-16 ***
## fBMIUnder -6.46970 2.58809 -2.500 0.0125 *
## AGE 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# Here is the releveled factor version
obese.fBMI.fit <- lm(SYSBP ~ obese.fBMI + AGE + MALE + BPMEDS, data = fram1)
summary(obese.fBMI.fit)
##
## Call:
## lm(formula = SYSBP ~ obese.fBMI + AGE + MALE + BPMEDS, data = fram1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.52879 1.95167 50.997 < 2e-16 ***
## obese.fBMINorm -14.85372 0.92743 -16.016 < 2e-16 ***
## obese.fBMIOver -7.65254 0.92892 -8.238 2.29e-16 ***
## obese.fBMIUnder -21.32341 2.67961 -7.958 2.22e-15 ***
## AGE 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# Nice UCLA tutorial on R contrast coding options
# https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
#
# Lets return to the BRFSS dataset
#
# BRFSS Data:
# The Behavioral Risk Factor Surveillance System (BRFSS)
# is the nation's premier system of health-related telephone
# surveys that collect state data about U.S. residents regarding
# their health-related risk behaviors, chronic health conditions,
# and use of preventive services.
# 2018 Cohort Charcteristics
# Overall cohort size = 437436 observations of 275 variables
# Restricting cohort to NC = 4729 observations
# *** subset(brfss2018,`_STATE`==37) ***
# Restricting cohort to those who drank at least one drink of
# alcohol in last 30 days = 2084 observations
# *** subset(nc.brfss,DRNKANY5==1) ***
# Restricting cohort to low-quantity (non-heavy) alcohol
# drinkers = 1759 observations
# *** subset(l30.nc.brfss,`_RFDRHV6`==1) ***
# We have brfssm1d1 with 1759 observations and have included the following variables:
# 1. BMI (units=kilograms per meter squared)
# 2. DAYS.DRINKING (# of Days on which had at least 1 alcoholic drink in past 30 days)
# 3. AGE (in years, collapsed above 80)
# 4. SLEEP (average # hours sleep in a 24 hour period)
# 5. EXERCISE.YN (Y/N participate in any physical activites/exercise during past month)
# 6. FEMALE (Sex at birth; 0=MALE, 1=FEMALE)
# Download and load the data file = brfssm1d1
download.file("http://www.duke.edu/~sgrambow/crp241data/brfssm1d1.RData",
destfile = "brfssm1d1.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("brfssm1d1.RData")
# Lets fit a multiple linear regression model-- bmi.fit2
# fit a multiple linear regression model with
# Y = BMI
# X1 = DAYS.DRINKING
# X2 = AGE
# X3 = SLEEP
# X4 = EXERCISE.YN
# X5 = FEMALE
bmi.fit2 <- lm(BMI ~ DAYS.DRINKING +AGE + SLEEP + EXERCISE.YN + FEMALE, data=brfssm1d1)
summary(bmi.fit2)
##
## Call:
## lm(formula = BMI ~ DAYS.DRINKING + AGE + SLEEP + EXERCISE.YN +
## FEMALE, data = brfssm1d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6206 -3.7051 -0.8026 2.4948 29.0633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.800838 0.903225 27.458 < 2e-16 ***
## DAYS.DRINKING -0.098593 0.019413 -5.079 4.22e-07 ***
## AGE 0.017992 0.008024 2.242 0.0251 *
## SLEEP -0.044673 0.105855 -0.422 0.6731
## EXERCISE.YN 2.814573 0.390835 7.201 8.95e-13 ***
## FEMALE -0.497176 0.280881 -1.770 0.0769 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.664 on 1683 degrees of freedom
## (70 observations deleted due to missingness)
## Multiple R-squared: 0.04963, Adjusted R-squared: 0.04681
## F-statistic: 17.58 on 5 and 1683 DF, p-value: < 2.2e-16
# Generate standard diagnostic plots
plot(bmi.fit2)




# Additional Diagnostics focused
# on measures of influence
library(olsrr)
# Cook's D Chart
ols_plot_cooksd_chart(bmi.fit2)

# DFBETAs Panel
ols_plot_dfbetas(bmi.fit2)


# DFFITs Plot
ols_plot_dffits(bmi.fit2)

# Standardized Residual Chart
ols_plot_resid_stand(bmi.fit2)

# Studentized Residuals vs Leverage Plot
ols_plot_resid_lev(bmi.fit2)

#
# End of Program