# ~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Assignment 5 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~

# Prevention of REnal and Vascular END-stage Disease (PREVEND) study

# A study by Joosten, et al. examined the association of statin use
# and other variables with cognitive ability (as measured by the 
# Ruff Figural Fluency Test (RFFT) measure of cognitive function) 
# in a cross-sectional cohort of 4,095 participants from the 
# Netherlands who were part of the larger PREVEND study. 
# RFFT Scores range from 0 (worst) to 175 (best). 
# The dataset we will work with includes a random sample of 500 
# participants from the cohort. 

# Data Dictionary

# Variables
# RFFT       Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Age        Age in years (Integer) 
# Statin     Statin use ("No","Yes") (Factor Variable) 
# Sex        Sex at birth ("Male","Female") (Factor Variable) 
# Educ       Education (integer: 0=primary school; 1=Lower secondary school;
#                       2=Higher Secondary School; 3=University)
# Education  ("Primary"; LowerSecond; HigherSecond; Univ) (Factor Variable) 
# CVD        1=CVD present; 0=CVD absent (Factor Variable)
# SBP        Systolic Blood Pressure (Numeric)
# DBP        Diastolic Blood Pressure (Numeric)
# Smoking    Smoking Status ("No","Yes") (Factor Variable) 
# Chol       Total Cholesterol (mmol/L) (Numeric)
# HDL        HDL Cholesterol (mmol/L) (Numeric)
# DM         Type 2 Diabetes status ("No","Yes") (Factor Variable)
# eGFR       EGFR (mL/min/1.73m2) (Numeric)

# Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData"))

# if the above load command doesnt run for you try the
# set of commands below to load the data frame
# highlight and run all 3 lines of code
download.file("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData",
              destfile = "prevend.final2.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("prevend.final2.RData")
# Start of Descriptive Analyses

# examine structure of the data 
str(prevend.final2)
## 'data.frame':    500 obs. of  14 variables:
##  $ Age      : int  55 65 46 68 70 53 64 70 46 44 ...
##  $ Sex      : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 2 1 ...
##  $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 3 2 4 3 3 1 2 4 3 4 ...
##  $ RFFT     : int  62 79 89 70 35 14 31 47 88 91 ...
##  $ CVD      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DM       : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ Smoking  : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ SBP      : num  122 108 120 114 114 ...
##  $ DBP      : num  63.5 66.5 75 67 72.5 75 77 76.5 99 70 ...
##  $ eGFR     : num  83.3 76.5 76.4 61.2 88.1 ...
##  $ Chol     : num  3.86 5.64 6.83 7.11 5.04 3.05 4.9 5.5 3.92 5.75 ...
##  $ HDL      : num  1.54 1.53 1.04 1.85 1.4 0.79 1.23 1.57 1.39 1.18 ...
##  $ Statin   : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Educ     : int  2 1 3 2 2 0 1 3 2 3 ...
# create subgroups by Statin Use status.  These subgroups
# can be used as inputs to some of the functions we want
# to use to describe and test the data
yes.statin <- subset(prevend.final2,Statin=="Yes")  
no.statin <- subset(prevend.final2,Statin=="No")  

# lets use a boxplot to visualize the data and assess structural 
# characteristics of the data
boxplot(yes.statin$RFFT,no.statin$RFFT,
        main="RFFT by Statin Use",
        ylab="RFFT",
        xlab="Statin Use",
        names=c("Statin User","Not Using Statin"),
        col="light blue")

# add mean to each boxplot with points function
points(c(mean(yes.statin$RFFT),mean(no.statin$RFFT)),cex=1,pch=16,col="red")

# let's calculate some additional descriptive statistics
summary(yes.statin$RFFT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.00   44.00   57.00   60.66   73.00  134.00
summary(no.statin$RFFT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   47.00   71.00   70.71   91.00  136.00
# and separately calculate SD for each group since 
# that is not included in summary()
sd(yes.statin$RFFT)
## [1] 24.80658
sd(no.statin$RFFT)
## [1] 27.73613
# Start of Regresssion Analyses

# Load needed packages
# need ggplot2 for visualizations of regression
# install.packages("ggplot2")
library(ggplot2)

# visualize data (RFFT vs Statin) using ggplot boxplot
# install.
ggplot(prevend.final2, aes(x=Statin, y=RFFT, fill=Statin))+
  geom_boxplot()+
  geom_jitter(shape=16, position=position_jitter(0.1))

# Visualize relationship between RFFT (cognitive function) and statin use
# using a scatter plot and regression line
# to get this plot in ggplot, we need to modify Statin 
# which is a factor so create new integer 0 vs 1 variable
# 0 - Statin Use No; 1 - Statin Use Yes
prevend.final2$Statin.01 <- as.integer(prevend.final2$Statin)-1
#check recording
table(prevend.final2$Statin.01,prevend.final2$Statin)
##    
##      No Yes
##   0 385   0
##   1   0 115
# now plot
ggplot(prevend.final2, aes(x=Statin.01, y=RFFT))+
  geom_point() +
  geom_smooth(method=lm,formula= y~x,se=TRUE)

# Fit Simple Linear Regression: RFFT ~ Statin
RFFT.statin.model  <- lm(RFFT ~ Statin, data = prevend.final2)
summary(RFFT.statin.model)
## 
## Call:
## lm(formula = RFFT ~ Statin, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.714 -22.714   0.286  18.299  73.339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   70.714      1.381  51.212  < 2e-16 ***
## StatinYes    -10.053      2.879  -3.492 0.000523 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.09 on 498 degrees of freedom
## Multiple R-squared:  0.0239, Adjusted R-squared:  0.02194 
## F-statistic: 12.19 on 1 and 498 DF,  p-value: 0.0005226
# Check Regression Diagnostics
plot(RFFT.statin.model)

# Predicting mean values from model

# Approach 1 for calculating predicted mean value for Statin Yes group
# calculating predicted values manually from model 
# using slope and intercept
# from model we know slope is for statin use YES vs NO
predicted.meanRFFT.StatinYES.approach1 <- 70.714 - 10.053
predicted.meanRFFT.StatinYES.approach1
## [1] 60.661
# Approach 2 for calculating predicted mean value for Statin Yes group
# we are just talking about mean in Statin YES group
# we can get this using the yes.statin data frame
# simply using the mean() function
predicted.meanRFFT.StatinYES.approach2 <- mean(yes.statin$RFFT)
predicted.meanRFFT.StatinYES.approach2
## [1] 60.66087
# similarly, we can get the mean for statinNO
predicted.meanRFFT.StatinNO.approach2 <- mean(no.statin$RFFT)
predicted.meanRFFT.StatinNO.approach2
## [1] 70.71429
# Approach 3
# we can use the predict function and our regression
# model to generate predictions
# using predict function also IMPORTANTLY gives confidence limits
# recall Statin is a factor with levels "Yes" and "No"
# and we must specify the level of interest for our prediction
# so predicted mean for Statin = "Yes"
predict(RFFT.statin.model,newdata=data.frame(Statin="Yes"),interval='conf')
##        fit      lwr      upr
## 1 60.66087 55.69699 65.62474
# and predicted mean for Statin = "No"
predict(RFFT.statin.model,newdata=data.frame(Statin="No"),interval='conf')
##        fit      lwr      upr
## 1 70.71429 68.00135 73.42722
# Regression analyses to examine confounding

# visualize relationship between Statin use and age using ggplot boxplot
ggplot(prevend.final2, aes(x=Statin, y=Age, fill=Statin))+
  geom_boxplot()+
  geom_jitter(shape=16, position=position_jitter(0.1))

# Visualize relationship between RFFT (cognitive function) and age?
# visualize data (RFFT vs Age) using ggplot scatter plot
ggplot(prevend.final2, aes(x=Age, y=RFFT))+
  geom_point() +
  geom_smooth(method=lm,formula= y~x,se=TRUE)

# Consideration of Age as a confounder of the relationship between
# RFFT and Statin Use
RFFT.statin.confound.age.model  <- lm(RFFT ~ Statin + Age, data = prevend.final2)
summary(RFFT.statin.confound.age.model)
## 
## Call:
## lm(formula = RFFT ~ Statin + Age, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.855 -16.860  -1.178  15.730  58.751 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 137.8822     5.1221  26.919   <2e-16 ***
## StatinYes     0.8509     2.5957   0.328    0.743    
## Age          -1.2710     0.0943 -13.478   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.21 on 497 degrees of freedom
## Multiple R-squared:  0.2852, Adjusted R-squared:  0.2823 
## F-statistic: 99.13 on 2 and 497 DF,  p-value: < 2.2e-16
# Check Regression Diagnostics
plot(RFFT.statin.confound.age.model)

# Regression analyses to examine interaction

RFFT.statin.interaction.sex.model <- lm(RFFT ~ Statin + Sex + Statin*Sex,
                                   data=prevend.final2)
summary(RFFT.statin.interaction.sex.model)  
## 
## Call:
## lm(formula = RFFT ~ Statin + Sex + Statin * Sex, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.474 -22.474  -0.107  18.526  73.740 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          70.4740     1.9591  35.973  < 2e-16 ***
## StatinYes           -10.2137     3.7327  -2.736  0.00644 ** 
## SexFemale             0.4794     2.7670   0.173  0.86252    
## StatinYes:SexFemale   0.6175     5.9411   0.104  0.91727    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.15 on 496 degrees of freedom
## Multiple R-squared:  0.02404,    Adjusted R-squared:  0.01814 
## F-statistic: 4.073 on 3 and 496 DF,  p-value: 0.007105
# Check Regression Diagnostics
plot(RFFT.statin.interaction.sex.model)

# End Of File







# End of Program