# ~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Assignment 5 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~
# Prevention of REnal and Vascular END-stage Disease (PREVEND) study
# A study by Joosten, et al. examined the association of statin use
# and other variables with cognitive ability (as measured by the
# Ruff Figural Fluency Test (RFFT) measure of cognitive function)
# in a cross-sectional cohort of 4,095 participants from the
# Netherlands who were part of the larger PREVEND study.
# RFFT Scores range from 0 (worst) to 175 (best).
# The dataset we will work with includes a random sample of 500
# participants from the cohort.
# Data Dictionary
# Variables
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Age Age in years (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
# Sex Sex at birth ("Male","Female") (Factor Variable)
# Educ Education (integer: 0=primary school; 1=Lower secondary school;
# 2=Higher Secondary School; 3=University)
# Education ("Primary"; LowerSecond; HigherSecond; Univ) (Factor Variable)
# CVD 1=CVD present; 0=CVD absent (Factor Variable)
# SBP Systolic Blood Pressure (Numeric)
# DBP Diastolic Blood Pressure (Numeric)
# Smoking Smoking Status ("No","Yes") (Factor Variable)
# Chol Total Cholesterol (mmol/L) (Numeric)
# HDL HDL Cholesterol (mmol/L) (Numeric)
# DM Type 2 Diabetes status ("No","Yes") (Factor Variable)
# eGFR EGFR (mL/min/1.73m2) (Numeric)
# Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData"))
# if the above load command doesnt run for you try the
# set of commands below to load the data frame
# highlight and run all 3 lines of code
download.file("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData",
destfile = "prevend.final2.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("prevend.final2.RData")
# Start of Descriptive Analyses
# examine structure of the data
str(prevend.final2)
## 'data.frame': 500 obs. of 14 variables:
## $ Age : int 55 65 46 68 70 53 64 70 46 44 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 2 1 ...
## $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 3 2 4 3 3 1 2 4 3 4 ...
## $ RFFT : int 62 79 89 70 35 14 31 47 88 91 ...
## $ CVD : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ DM : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ Smoking : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ SBP : num 122 108 120 114 114 ...
## $ DBP : num 63.5 66.5 75 67 72.5 75 77 76.5 99 70 ...
## $ eGFR : num 83.3 76.5 76.4 61.2 88.1 ...
## $ Chol : num 3.86 5.64 6.83 7.11 5.04 3.05 4.9 5.5 3.92 5.75 ...
## $ HDL : num 1.54 1.53 1.04 1.85 1.4 0.79 1.23 1.57 1.39 1.18 ...
## $ Statin : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ Educ : int 2 1 3 2 2 0 1 3 2 3 ...
# create subgroups by Statin Use status. These subgroups
# can be used as inputs to some of the functions we want
# to use to describe and test the data
yes.statin <- subset(prevend.final2,Statin=="Yes")
no.statin <- subset(prevend.final2,Statin=="No")
# lets use a boxplot to visualize the data and assess structural
# characteristics of the data
boxplot(yes.statin$RFFT,no.statin$RFFT,
main="RFFT by Statin Use",
ylab="RFFT",
xlab="Statin Use",
names=c("Statin User","Not Using Statin"),
col="light blue")
# add mean to each boxplot with points function
points(c(mean(yes.statin$RFFT),mean(no.statin$RFFT)),cex=1,pch=16,col="red")

# let's calculate some additional descriptive statistics
summary(yes.statin$RFFT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 44.00 57.00 60.66 73.00 134.00
summary(no.statin$RFFT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 47.00 71.00 70.71 91.00 136.00
# and separately calculate SD for each group since
# that is not included in summary()
sd(yes.statin$RFFT)
## [1] 24.80658
sd(no.statin$RFFT)
## [1] 27.73613
# Start of Regresssion Analyses
# Load needed packages
# need ggplot2 for visualizations of regression
# install.packages("ggplot2")
library(ggplot2)
# visualize data (RFFT vs Statin) using ggplot boxplot
# install.
ggplot(prevend.final2, aes(x=Statin, y=RFFT, fill=Statin))+
geom_boxplot()+
geom_jitter(shape=16, position=position_jitter(0.1))

# Visualize relationship between RFFT (cognitive function) and statin use
# using a scatter plot and regression line
# to get this plot in ggplot, we need to modify Statin
# which is a factor so create new integer 0 vs 1 variable
# 0 - Statin Use No; 1 - Statin Use Yes
prevend.final2$Statin.01 <- as.integer(prevend.final2$Statin)-1
#check recording
table(prevend.final2$Statin.01,prevend.final2$Statin)
##
## No Yes
## 0 385 0
## 1 0 115
# now plot
ggplot(prevend.final2, aes(x=Statin.01, y=RFFT))+
geom_point() +
geom_smooth(method=lm,formula= y~x,se=TRUE)

# Fit Simple Linear Regression: RFFT ~ Statin
RFFT.statin.model <- lm(RFFT ~ Statin, data = prevend.final2)
summary(RFFT.statin.model)
##
## Call:
## lm(formula = RFFT ~ Statin, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.714 -22.714 0.286 18.299 73.339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.714 1.381 51.212 < 2e-16 ***
## StatinYes -10.053 2.879 -3.492 0.000523 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.09 on 498 degrees of freedom
## Multiple R-squared: 0.0239, Adjusted R-squared: 0.02194
## F-statistic: 12.19 on 1 and 498 DF, p-value: 0.0005226
# Check Regression Diagnostics
plot(RFFT.statin.model)




# Predicting mean values from model
# Approach 1 for calculating predicted mean value for Statin Yes group
# calculating predicted values manually from model
# using slope and intercept
# from model we know slope is for statin use YES vs NO
predicted.meanRFFT.StatinYES.approach1 <- 70.714 - 10.053
predicted.meanRFFT.StatinYES.approach1
## [1] 60.661
# Approach 2 for calculating predicted mean value for Statin Yes group
# we are just talking about mean in Statin YES group
# we can get this using the yes.statin data frame
# simply using the mean() function
predicted.meanRFFT.StatinYES.approach2 <- mean(yes.statin$RFFT)
predicted.meanRFFT.StatinYES.approach2
## [1] 60.66087
# similarly, we can get the mean for statinNO
predicted.meanRFFT.StatinNO.approach2 <- mean(no.statin$RFFT)
predicted.meanRFFT.StatinNO.approach2
## [1] 70.71429
# Approach 3
# we can use the predict function and our regression
# model to generate predictions
# using predict function also IMPORTANTLY gives confidence limits
# recall Statin is a factor with levels "Yes" and "No"
# and we must specify the level of interest for our prediction
# so predicted mean for Statin = "Yes"
predict(RFFT.statin.model,newdata=data.frame(Statin="Yes"),interval='conf')
## fit lwr upr
## 1 60.66087 55.69699 65.62474
# and predicted mean for Statin = "No"
predict(RFFT.statin.model,newdata=data.frame(Statin="No"),interval='conf')
## fit lwr upr
## 1 70.71429 68.00135 73.42722
# Regression analyses to examine confounding
# visualize relationship between Statin use and age using ggplot boxplot
ggplot(prevend.final2, aes(x=Statin, y=Age, fill=Statin))+
geom_boxplot()+
geom_jitter(shape=16, position=position_jitter(0.1))

# Visualize relationship between RFFT (cognitive function) and age?
# visualize data (RFFT vs Age) using ggplot scatter plot
ggplot(prevend.final2, aes(x=Age, y=RFFT))+
geom_point() +
geom_smooth(method=lm,formula= y~x,se=TRUE)

# Consideration of Age as a confounder of the relationship between
# RFFT and Statin Use
RFFT.statin.confound.age.model <- lm(RFFT ~ Statin + Age, data = prevend.final2)
summary(RFFT.statin.confound.age.model)
##
## Call:
## lm(formula = RFFT ~ Statin + Age, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.855 -16.860 -1.178 15.730 58.751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.8822 5.1221 26.919 <2e-16 ***
## StatinYes 0.8509 2.5957 0.328 0.743
## Age -1.2710 0.0943 -13.478 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.21 on 497 degrees of freedom
## Multiple R-squared: 0.2852, Adjusted R-squared: 0.2823
## F-statistic: 99.13 on 2 and 497 DF, p-value: < 2.2e-16
# Check Regression Diagnostics
plot(RFFT.statin.confound.age.model)




# Regression analyses to examine interaction
RFFT.statin.interaction.sex.model <- lm(RFFT ~ Statin + Sex + Statin*Sex,
data=prevend.final2)
summary(RFFT.statin.interaction.sex.model)
##
## Call:
## lm(formula = RFFT ~ Statin + Sex + Statin * Sex, data = prevend.final2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.474 -22.474 -0.107 18.526 73.740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.4740 1.9591 35.973 < 2e-16 ***
## StatinYes -10.2137 3.7327 -2.736 0.00644 **
## SexFemale 0.4794 2.7670 0.173 0.86252
## StatinYes:SexFemale 0.6175 5.9411 0.104 0.91727
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.15 on 496 degrees of freedom
## Multiple R-squared: 0.02404, Adjusted R-squared: 0.01814
## F-statistic: 4.073 on 3 and 496 DF, p-value: 0.007105
# Check Regression Diagnostics
plot(RFFT.statin.interaction.sex.model)




# End Of File
# End of Program