#
# CRP245 Spring 2022 Final Exam
#
# Prevention of REnal and Vascular END-stage Disease (PREVEND) study
# A study by Joosten, et al. examined the association of statin use
# and other variables with cognitive ability (as measured by the Ruff
# Figural Fluency Test (RFFT) measure of cognitive function) in a
# cross-sectional cohort of 4,095 participants from the Netherlands
# who were part of the larger PREVEND study. RFFT Scores range from
# 0 (worst) to 175 (best). The analyses presented here are based on
# a random sample of 413 participants from the cohort.
# Dataset Description
# Variables
# RFFT Ruff Figural Fluency Test (RFFT) measure of cognitive function
# Age Age in years
# Statin 1=Statin User; 0=otherwise
# Sex sex at birth (0=males; 1=females)
# Educ Education (numeric: 0=primary school; 1=Lower secondary school;
# 2=Higher Secondary School; 3=University)
# Education Factor Variable (Primary; LowerSecond; HigherSecond; Univ)
# CVD 1=CVD present; 0=CVD absent
# SBP Systolic Blood Pressure
# DBP Diastolic Blood Pressure
# Smoking 1=smoker; 0=non-smoker
# Chol Total Cholesterol (mmol/L)
# HDL HDL Cholesterol (mmol/L)
# DM 1=Type 2 Diabetes YES; 0=No Type 2 Diab
# eGFR EGFR (mL/min/1.73m2)
## Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/sp.22.prevend.final2.RData"))
# structure of the data
str(prevend.final2)
## 'data.frame': 413 obs. of 15 variables:
## $ Age : int 46 72 65 63 61 40 62 49 38 59 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 1 1 2 2 1 2 2 1 2 1 ...
## $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 4 3 2 2 3 3 2 2 4 3 ...
## $ RFFT : int 46 60 87 35 44 104 48 54 125 51 ...
## $ CVD : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 2 ...
## $ DM : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 2 ...
## $ Smoking : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ SBP : num 109 159 110 136 126 ...
## $ DBP : num 69.5 78 76 73 79 67.5 71.5 79 66.5 71.5 ...
## $ eGFR : num 72.9 74.3 72.8 94.1 74.9 ...
## $ Chol : num 5.52 5.15 6.57 6.36 5.96 5.26 3.65 4.16 4.29 2.23 ...
## $ BMI : num 24.7 31.3 21.7 34.3 23.5 ...
## $ HDL : num 1.36 1.04 2.08 1.92 1.21 1.38 1.15 0.86 2.02 0.85 ...
## $ Statin : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 2 1 2 ...
## $ Educ : int 3 2 1 1 2 2 1 1 3 2 ...
# summary of overall data -- any missing values?
summary(prevend.final2)
## Age Sex Education RFFT CVD
## Min. :36.00 Male :211 Primary : 48 Min. : 11.00 No :382
## 1st Qu.:46.00 Female:202 LowerSecond :130 1st Qu.: 44.00 Yes: 31
## Median :53.00 HigherSecond:111 Median : 64.00
## Mean :54.95 Univ :124 Mean : 66.58
## 3rd Qu.:64.00 3rd Qu.: 85.00
## Max. :81.00 Max. :136.00
## DM Smoking SBP DBP eGFR
## No :390 No :314 Min. : 77.5 Min. : 48.00 Min. : 41.24
## Yes: 23 Yes: 99 1st Qu.:110.5 1st Qu.: 66.00 1st Qu.: 68.94
## Median :122.0 Median : 71.50 Median : 75.69
## Mean :124.1 Mean : 72.42 Mean : 78.03
## 3rd Qu.:133.0 3rd Qu.: 78.00 3rd Qu.: 87.36
## Max. :188.5 Max. :100.50 Max. :123.47
## Chol BMI HDL Statin Educ
## Min. :2.230 Min. :18.34 Min. :-1.000 No :317 Min. :0.000
## 1st Qu.:4.610 1st Qu.:23.74 1st Qu.: 1.160 Yes: 96 1st Qu.:1.000
## Median :5.310 Median :25.69 Median : 1.400 Median :2.000
## Mean :5.356 Mean :26.57 Mean : 1.417 Mean :1.753
## 3rd Qu.:5.970 3rd Qu.:28.56 3rd Qu.: 1.670 3rd Qu.:3.000
## Max. :9.220 Max. :60.95 Max. : 2.810 Max. :3.000
# summary of outcome = RFFT
# numerical descriptives
summary(prevend.final2$RFFT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 44.00 64.00 66.58 85.00 136.00
sd(prevend.final2$RFFT)
## [1] 27.27229
# graphical descriptives
# boxplot
# visualize Y with a nice looking
# boxplot that adds some extra labeling
boxplot(prevend.final2$RFFT,
main="Boxplot of RFFT showing variability in Outcome",
range=0, # this suppresses outliers since we are overlaying points
ylab = "RFFT")
# add individual points to boxplot to show spread
# using stripchart function
stripchart(prevend.final2$RFFT, vertical=TRUE, add=TRUE, method="jitter",
jitter=0.03,col='blue',cex=1.1, pch=20)
# add mean to boxplot with points function
points(mean(prevend.final2$RFFT),cex=2,pch=16,col="red")
# add a legend
legend("topright", c("Data Points", "Sample Mean","Sample Median"),
cex=0.8,lty=c(1,1,1),lwd=c(3,3,3),
col = c("blue","red","black",bty ="n"))
# add text with mean Y and Variance Y
text(1.16,80,"Mean =",adj=c(-1,0),col="red")
text(1.26,80,round(mean(prevend.final2$RFFT),2),adj=c(-1,0),col="red")
text(1.2,70,"SD =",adj=c(-1,0),col="blue")
text(1.26,70,round(sd(prevend.final2$RFFT),2),adj=c(-1,0),col="blue")
# Load needed packages
# need ggplot2 for visualizations of regression
# install.packages("ggplot2")
library(ggplot2)

# What is the relationship between RFFT (cognitive function) and statin use?
# visualize data (RFFT vs Statin) using ggplot boxplot
ggplot(prevend.final2, aes(x=Statin, y=RFFT, fill=Statin))+
geom_boxplot()+
geom_jitter(shape=16, position=position_jitter(0.1))

# MODEL 1 - examines the association between cognitive ability
# and statin use alone
# Simple Linear Regression: RFFT ~ Statin
model1 <- lm(RFFT ~ Statin, data = prevend.final2)
summary(model1)
##
## Call:
## lm(formula = RFFT ~ Statin, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.290 -22.290 0.365 17.710 66.710
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.290 1.508 45.937 < 2e-16 ***
## StatinYes -11.655 3.129 -3.725 0.000222 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.86 on 411 degrees of freedom
## Multiple R-squared: 0.03266, Adjusted R-squared: 0.03031
## F-statistic: 13.88 on 1 and 411 DF, p-value: 0.0002224
# Consideration of Age as a confounder of the relationship between
# RFFT and Statin Use
# Model 2 examines the association between cognitive ability
# and statin use adjusted for potential confounders including
# age, educational level and presence of cardiovascular disease.
model2 <- lm(RFFT ~ Statin + Age + Education + CVD,
data=prevend.final2)
summary(model2)
##
## Call:
## lm(formula = RFFT ~ Statin + Age + Education + CVD, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.588 -16.136 -0.937 12.769 64.377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.1409 6.9534 14.258 < 2e-16 ***
## StatinYes 4.0501 2.8222 1.435 0.15203
## Age -0.8959 0.1028 -8.718 < 2e-16 ***
## EducationLowerSecond 8.0293 3.6325 2.210 0.02764 *
## EducationHigherSecond 19.0765 3.8640 4.937 1.16e-06 ***
## EducationUniv 30.2396 3.7876 7.984 1.47e-14 ***
## CVDYes -13.3537 4.2858 -3.116 0.00196 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.3 on 406 degrees of freedom
## Multiple R-squared: 0.3991, Adjusted R-squared: 0.3902
## F-statistic: 44.95 on 6 and 406 DF, p-value: < 2.2e-16
# check model diagnostics
plot(model2)




# check variance inflation factors
library(car)
## Loading required package: carData
vif(model2)
## GVIF Df GVIF^(1/(2*Df))
## Statin 1.294016 1 1.137548
## Age 1.264724 1 1.124599
## Education 1.162706 3 1.025443
## CVD 1.161296 1 1.077634