#
# CRP245 Spring 2021 Final Exam
#
# R Code for Question 4-7
# Prevention of REnal and Vascular END-stage Disease (PREVEND) study
# Variables
# RFFT Ruff Figural Fluency Test (RFFT) measure of cognitive function
# Age Age in years
# Statin 1=Statin User; 0=otherwise
# Sex sex at birth (0=males; 1=females)
# Educ Education (numeric: 0=primary school; 1=Lower secondary school;
# 2=Higher Secondary School; 3=University)
# Education Factor Variable (Primary; LowerSecond; HigherSecond; Univ)
# CVD 1=CVD present; 0=CVD absent
# SBP Systolic Blood Pressure
# DBP Diastolic Blood Pressure
# BMI Body Mass Index (kg/m2)
# Smoking 1=smoker; 0=non-smoker
# Chol Total Cholesterol (mmol/L)
# HDL HDL Cholesterol (mmol/L)
# DM 1=Type 2 Diabetes YES; 0=No Type 2 Diab
# eGFR EGFR (mL/min/1.73m2)
## Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData"))
# structure of the data
str(prevend.final2)
## 'data.frame': 500 obs. of 14 variables:
## $ Age : int 55 65 46 68 70 53 64 70 46 44 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 2 1 ...
## $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 3 2 4 3 3 1 2 4 3 4 ...
## $ RFFT : int 62 79 89 70 35 14 31 47 88 91 ...
## $ CVD : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ DM : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ Smoking : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ SBP : num 122 108 120 114 114 ...
## $ DBP : num 63.5 66.5 75 67 72.5 75 77 76.5 99 70 ...
## $ eGFR : num 83.3 76.5 76.4 61.2 88.1 ...
## $ Chol : num 3.86 5.64 6.83 7.11 5.04 3.05 4.9 5.5 3.92 5.75 ...
## $ HDL : num 1.54 1.53 1.04 1.85 1.4 0.79 1.23 1.57 1.39 1.18 ...
## $ Statin : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ Educ : int 2 1 3 2 2 0 1 3 2 3 ...
# summary of overall data -- any missing values?
summary(prevend.final2)
## Age Sex Education RFFT CVD
## Min. :36.00 Male :265 Primary : 51 Min. : 11.0 No :460
## 1st Qu.:46.00 Female:235 LowerSecond :157 1st Qu.: 46.0 Yes: 40
## Median :54.00 HigherSecond:134 Median : 67.0
## Mean :54.82 Univ :158 Mean : 68.4
## 3rd Qu.:64.00 3rd Qu.: 88.0
## Max. :81.00 Max. :136.0
## DM Smoking SBP DBP eGFR
## No :467 No :384 Min. : 77.5 Min. : 48.00 Min. : 38.48
## Yes: 33 Yes:116 1st Qu.:112.0 1st Qu.: 66.50 1st Qu.: 68.54
## Median :123.0 Median : 72.00 Median : 76.36
## Mean :125.8 Mean : 73.05 Mean : 77.73
## 3rd Qu.:137.0 3rd Qu.: 78.62 3rd Qu.: 87.32
## Max. :188.5 Max. :101.50 Max. :131.50
## Chol HDL Statin Educ
## Min. :2.230 Min. :-1.000 No :385 Min. :0.000
## 1st Qu.:4.647 1st Qu.: 1.150 Yes:115 1st Qu.:1.000
## Median :5.335 Median : 1.380 Median :2.000
## Mean :5.382 Mean : 1.405 Mean :1.798
## 3rd Qu.:6.053 3rd Qu.: 1.643 3rd Qu.:3.000
## Max. :9.220 Max. : 2.810 Max. :3.000
# summary of outcome = RFFT
# numerical descriptives
summary(prevend.final2$RFFT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.0 46.0 67.0 68.4 88.0 136.0
sd(prevend.final2$RFFT)
## [1] 27.39565
# KEY POINT
# mean of RFFT: 68.1
# median of RFFT: 66.0
# stand dev of RFFT: 27.50
# graphical descriptives
# boxplot
# visualize Y with a nice looking
# boxplot that adds some extra labeling
boxplot(prevend.final2$RFFT,
main="Boxplot of RFFT showing variability in Outcome",
range=0, # this suppresses outliers since we are overlaying points
ylab = "RFFT")
# add individual points to boxplot to show spread
# using stripchart function
stripchart(prevend.final2$RFFT, vertical=TRUE, add=TRUE, method="jitter",
jitter=0.03,col='blue',cex=1.1, pch=20)
# add mean to boxplot with points function
points(mean(prevend.final2$RFFT),cex=2,pch=16,col="red")
# add a legend
legend("topright", c("Data Points", "Sample Mean","Sample Median"),
cex=0.8,lty=c(1,1,1),lwd=c(3,3,3),
col = c("blue","red","black",bty ="n"))
# add text with mean Y and Variance Y
text(1.16,80,"Mean =",adj=c(-1,0),col="red")
text(1.26,80,round(mean(prevend.final2$RFFT),2),adj=c(-1,0),col="red")
text(1.2,70,"SD =",adj=c(-1,0),col="blue")
text(1.26,70,round(sd(prevend.final2$RFFT),2),adj=c(-1,0),col="blue")
# Load needed packages
# need ggplot2 for visualizations of regression
# install.packages("ggplot2")
library(ggplot2)

# What is the relationship between RFFT (cognitive function) and
# education level?
#
#create histogram: RFFT by Education
ggplot(prevend.final2, aes(x=Education, y=RFFT, fill=Education))+
geom_boxplot()+
geom_jitter(shape=16, position=position_jitter(0.1))

# NOTE: Model below uses numeric version of Education variable (Educ)
# MODEL 1 -- EDUC.SIMP.MODEL
# Simple Linear Regression: RFFT ~ Educ
educ.simp.model <- lm(RFFT ~ Educ, data = prevend.final2)
summary(educ.simp.model)
##
## Call:
## lm(formula = RFFT ~ Educ, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.622 -16.148 -0.885 15.536 62.694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.148 2.104 19.55 <2e-16 ***
## Educ 15.158 1.023 14.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.85 on 498 degrees of freedom
## Multiple R-squared: 0.3059, Adjusted R-squared: 0.3045
## F-statistic: 219.5 on 1 and 498 DF, p-value: < 2.2e-16
# Check Regression Diagnostics
plot(educ.simp.model)




# NOTE: Model below uses Factor version of Education variable (Education)
# Simple Linear Regression: RFFT ~ Education
education.simp.model <- lm(RFFT ~ Education, data = prevend.final2)
summary(education.simp.model)
##
## Call:
## lm(formula = RFFT ~ Education, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.905 -15.975 -0.905 16.068 63.280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.941 3.203 12.783 < 2e-16 ***
## EducationLowerSecond 14.779 3.686 4.009 7.04e-05 ***
## EducationHigherSecond 32.133 3.763 8.539 < 2e-16 ***
## EducationUniv 44.964 3.684 12.207 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.87 on 496 degrees of freedom
## Multiple R-squared: 0.3072, Adjusted R-squared: 0.303
## F-statistic: 73.3 on 3 and 496 DF, p-value: < 2.2e-16
# Check Regression Diagnostics
plot(education.simp.model)




#
# What is the relationship between RFFT (cognitive function) and statin use?
#
# visualize data (RFFT vs Statin) using ggplot boxplot
ggplot(prevend.final2, aes(x=Statin, y=RFFT, fill=Statin))+
geom_boxplot()+
geom_jitter(shape=16, position=position_jitter(0.1))

# MODEL 2 - STATIN.SIMP.MODEL
# Simple Linear Regression: RFFT ~ Statin
statin.simp.model <- lm(RFFT ~ Statin, data = prevend.final2)
summary(statin.simp.model)
##
## Call:
## lm(formula = RFFT ~ Statin, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.714 -22.714 0.286 18.299 73.339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.714 1.381 51.212 < 2e-16 ***
## StatinYes -10.053 2.879 -3.492 0.000523 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.09 on 498 degrees of freedom
## Multiple R-squared: 0.0239, Adjusted R-squared: 0.02194
## F-statistic: 12.19 on 1 and 498 DF, p-value: 0.0005226
# Check Regression Diagnostics
plot(statin.simp.model)




# Consideration of Age as a confounder of the relationship between
# RFFT and Statin Use
# Visualizing relationships: RFFT by age with Statin Use
ggplot(prevend.final2, aes(x=Age, y=RFFT, shape=Statin,
color=Statin))+
geom_point()

# MODEL 3 - STATIN.AGE.MODEL
statin.age.model <- lm(RFFT ~ Statin + Age, data = prevend.final2)
summary(statin.age.model)
##
## Call:
## lm(formula = RFFT ~ Statin + Age, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.855 -16.860 -1.178 15.730 58.751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.8822 5.1221 26.919 <2e-16 ***
## StatinYes 0.8509 2.5957 0.328 0.743
## Age -1.2710 0.0943 -13.478 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.21 on 497 degrees of freedom
## Multiple R-squared: 0.2852, Adjusted R-squared: 0.2823
## F-statistic: 99.13 on 2 and 497 DF, p-value: < 2.2e-16
# Check Regression Diagnostics
plot(statin.age.model)




#
# What is the relationship between RFFT (cognitive function) and age?
#
# visualize data (RFFT vs Age) using ggplot scatter plot
ggplot(prevend.final2, aes(x=Age, y=RFFT))+
geom_point() +
geom_smooth(method=lm,formula= y~x,se=TRUE)

# Simple Linear Regression: RFFT ~ Age
age.simp.model <- lm(RFFT ~ Age, data = prevend.final2)
summary(age.simp.model)
##
## Call:
## lm(formula = RFFT ~ Age, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.879 -16.845 -1.095 15.524 58.564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.54972 5.01614 27.42 <2e-16 ***
## Age -1.26136 0.08953 -14.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.19 on 498 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.2836
## F-statistic: 198.5 on 1 and 498 DF, p-value: < 2.2e-16
# Check Regression Diagnostics
plot(age.simp.model)




# MODEL 4 - INTERACTION.AGE.STATIN.MODEL
# INTERACTION MODEL FOR RFFT AGE AND STATIN USE
#
interaction.age.statin.model <- lm(RFFT ~ Statin + Age + Statin*Age,
data=prevend.final2)
summary(interaction.age.statin.model)
##
## Call:
## lm(formula = RFFT ~ Statin + Age + Statin * Age, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.551 -16.963 -1.179 15.764 58.802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 140.2031 5.6209 24.943 <2e-16 ***
## StatinYes -13.9720 15.0113 -0.931 0.352
## Age -1.3149 0.1040 -12.646 <2e-16 ***
## StatinYes:Age 0.2474 0.2468 1.003 0.317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.21 on 496 degrees of freedom
## Multiple R-squared: 0.2866, Adjusted R-squared: 0.2823
## F-statistic: 66.42 on 3 and 496 DF, p-value: < 2.2e-16
# MODEL 5 - STATIN.MULT.MODEL
# Running Multivariable Models to Account for Potential Confounding
# Consideration of multiple confounders: Age, Education, CVD status,
statin.mult.model <- lm(RFFT ~ Statin + Age + Education + CVD,
data=prevend.final2)
summary(statin.mult.model)
##
## Call:
## lm(formula = RFFT ~ Statin + Age + Education + CVD, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.348 -15.586 -0.136 13.795 63.935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.03507 6.33012 15.645 < 2e-16 ***
## StatinYes 4.69045 2.44802 1.916 0.05594 .
## Age -0.92029 0.09041 -10.179 < 2e-16 ***
## EducationLowerSecond 10.08831 3.37556 2.989 0.00294 **
## EducationHigherSecond 21.30146 3.57768 5.954 4.98e-09 ***
## EducationUniv 33.12464 3.54710 9.339 < 2e-16 ***
## CVDYes -7.56655 3.65164 -2.072 0.03878 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.71 on 493 degrees of freedom
## Multiple R-squared: 0.4355, Adjusted R-squared: 0.4286
## F-statistic: 63.38 on 6 and 493 DF, p-value: < 2.2e-16
# check model diagnostics
par(mfrow=c(2,2))
plot(statin.mult.model)

par(mfrow=c(1,1))
# check variance inflation factors
library(car)
## Loading required package: carData
vif(statin.mult.model)
## GVIF Df GVIF^(1/(2*Df))
## Statin 1.237387 1 1.112379
## Age 1.278789 1 1.130835
## Education 1.201504 3 1.031069
## CVD 1.144221 1 1.069683