#
# Assignment 4 CRP245 Spring 2023
#
# Prevention of REnal and Vascular END-stage Disease (PREVEND) study
# A study by Joosten, et al. examined the association of statin use 
# and other variables with cognitive ability (as measured by the Ruff 
# Figural Fluency Test (RFFT) measure of cognitive function) in a 
# cross-sectional cohort of 4,095 participants from the Netherlands 
# who were part of the larger PREVEND study. RFFT Scores range from 
# 0 (worst) to 175 (best). The analyses presented here are based on 
# a random sample of 413 participants from the cohort. 

# Dataset Description
# Variables
# RFFT       Ruff Figural Fluency Test (RFFT) measure of cognitive function         
# Age        Age in years
# Statin     1=Statin User; 0=otherwise
# Sex        sex at birth (0=males; 1=females)
# Educ       Education (numeric: 0=primary school; 1=Lower secondary school;
#                       2=Higher Secondary School; 3=University)
# Education  Factor Variable (Primary; LowerSecond; HigherSecond; Univ)
# CVD        1=CVD present; 0=CVD absent
# SBP        Systolic Blood Pressure
# DBP        Diastolic Blood Pressure
# Smoking    1=smoker; 0=non-smoker
# Chol       Total Cholesterol (mmol/L)
# HDL        HDL Cholesterol (mmol/L)
# DM         1=Type 2 Diabetes YES; 0=No Type 2 Diab
# eGFR       EGFR (mL/min/1.73m2)

## Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/sp.22.prevend.final2.RData"))

# structure of the data 
str(prevend.final2)
## 'data.frame':    413 obs. of  15 variables:
##  $ Age      : int  46 72 65 63 61 40 62 49 38 59 ...
##  $ Sex      : Factor w/ 2 levels "Male","Female": 1 1 2 2 1 2 2 1 2 1 ...
##  $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 4 3 2 2 3 3 2 2 4 3 ...
##  $ RFFT     : int  46 60 87 35 44 104 48 54 125 51 ...
##  $ CVD      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 2 ...
##  $ DM       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 2 ...
##  $ Smoking  : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ SBP      : num  109 159 110 136 126 ...
##  $ DBP      : num  69.5 78 76 73 79 67.5 71.5 79 66.5 71.5 ...
##  $ eGFR     : num  72.9 74.3 72.8 94.1 74.9 ...
##  $ Chol     : num  5.52 5.15 6.57 6.36 5.96 5.26 3.65 4.16 4.29 2.23 ...
##  $ BMI      : num  24.7 31.3 21.7 34.3 23.5 ...
##  $ HDL      : num  1.36 1.04 2.08 1.92 1.21 1.38 1.15 0.86 2.02 0.85 ...
##  $ Statin   : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 2 1 2 ...
##  $ Educ     : int  3 2 1 1 2 2 1 1 3 2 ...
# summary of overall data -- any missing values?
summary(prevend.final2)
##       Age            Sex             Education        RFFT         CVD     
##  Min.   :36.00   Male  :211   Primary     : 48   Min.   : 11.00   No :382  
##  1st Qu.:46.00   Female:202   LowerSecond :130   1st Qu.: 44.00   Yes: 31  
##  Median :53.00                HigherSecond:111   Median : 64.00            
##  Mean   :54.95                Univ        :124   Mean   : 66.58            
##  3rd Qu.:64.00                                   3rd Qu.: 85.00            
##  Max.   :81.00                                   Max.   :136.00            
##    DM      Smoking        SBP             DBP              eGFR       
##  No :390   No :314   Min.   : 77.5   Min.   : 48.00   Min.   : 41.24  
##  Yes: 23   Yes: 99   1st Qu.:110.5   1st Qu.: 66.00   1st Qu.: 68.94  
##                      Median :122.0   Median : 71.50   Median : 75.69  
##                      Mean   :124.1   Mean   : 72.42   Mean   : 78.03  
##                      3rd Qu.:133.0   3rd Qu.: 78.00   3rd Qu.: 87.36  
##                      Max.   :188.5   Max.   :100.50   Max.   :123.47  
##       Chol            BMI             HDL         Statin         Educ      
##  Min.   :2.230   Min.   :18.34   Min.   :-1.000   No :317   Min.   :0.000  
##  1st Qu.:4.610   1st Qu.:23.74   1st Qu.: 1.160   Yes: 96   1st Qu.:1.000  
##  Median :5.310   Median :25.69   Median : 1.400             Median :2.000  
##  Mean   :5.356   Mean   :26.57   Mean   : 1.417             Mean   :1.753  
##  3rd Qu.:5.970   3rd Qu.:28.56   3rd Qu.: 1.670             3rd Qu.:3.000  
##  Max.   :9.220   Max.   :60.95   Max.   : 2.810             Max.   :3.000
# summary of outcome = RFFT
# numerical descriptives
summary(prevend.final2$RFFT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.00   44.00   64.00   66.58   85.00  136.00
sd(prevend.final2$RFFT)
## [1] 27.27229
# graphical descriptives
# boxplot
# visualize Y with a nice looking
# boxplot that adds some extra labeling
boxplot(prevend.final2$RFFT,
        main="Boxplot of RFFT showing variability in Outcome",
        range=0, # this suppresses outliers since we are overlaying points
        ylab = "RFFT")
# add individual points to boxplot to show spread
# using stripchart function
stripchart(prevend.final2$RFFT, vertical=TRUE, add=TRUE, method="jitter",
           jitter=0.03,col='blue',cex=1.1, pch=20)
# add mean to boxplot with points function
points(mean(prevend.final2$RFFT),cex=2,pch=16,col="red")
# add a legend
legend("topright", c("Data Points", "Sample Mean","Sample Median"),
       cex=0.8,lty=c(1,1,1),lwd=c(3,3,3),
       col = c("blue","red","black",bty ="n"))
# add text with mean Y and Variance Y
text(1.16,80,"Mean =",adj=c(-1,0),col="red")
text(1.26,80,round(mean(prevend.final2$RFFT),2),adj=c(-1,0),col="red")
text(1.2,70,"SD =",adj=c(-1,0),col="blue")
text(1.26,70,round(sd(prevend.final2$RFFT),2),adj=c(-1,0),col="blue")

# Load needed packages
# need ggplot2 for visualizations of regression
# install.packages("ggplot2")
library(ggplot2)

# What is the relationship between RFFT (cognitive function) and statin use?

# visualize data (RFFT vs Statin) using ggplot boxplot
ggplot(prevend.final2, aes(x=Statin, y=RFFT, fill=Statin))+
        geom_boxplot()+
        geom_jitter(shape=16, position=position_jitter(0.1))

# MODEL 1 - examines the association between cognitive ability 
#           and statin use alone
# Simple Linear Regression: RFFT ~ Statin
model1  <- lm(RFFT ~ Statin, data = prevend.final2)
summary(model1)
## 
## Call:
## lm(formula = RFFT ~ Statin, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.290 -22.290   0.365  17.710  66.710 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   69.290      1.508  45.937  < 2e-16 ***
## StatinYes    -11.655      3.129  -3.725 0.000222 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.86 on 411 degrees of freedom
## Multiple R-squared:  0.03266,    Adjusted R-squared:  0.03031 
## F-statistic: 13.88 on 1 and 411 DF,  p-value: 0.0002224
# Consideration of Age as a confounder of the relationship between
# RFFT and Statin Use

# Model 2 examines the association between cognitive ability
# and statin use adjusted for potential confounders including 
# age, educational level and presence of cardiovascular disease.  
model2 <- lm(RFFT ~ Statin + Age + Education + CVD,
                    data=prevend.final2)
summary(model2)  
## 
## Call:
## lm(formula = RFFT ~ Statin + Age + Education + CVD, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.588 -16.136  -0.937  12.769  64.377 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            99.1409     6.9534  14.258  < 2e-16 ***
## StatinYes               4.0501     2.8222   1.435  0.15203    
## Age                    -0.8959     0.1028  -8.718  < 2e-16 ***
## EducationLowerSecond    8.0293     3.6325   2.210  0.02764 *  
## EducationHigherSecond  19.0765     3.8640   4.937 1.16e-06 ***
## EducationUniv          30.2396     3.7876   7.984 1.47e-14 ***
## CVDYes                -13.3537     4.2858  -3.116  0.00196 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.3 on 406 degrees of freedom
## Multiple R-squared:  0.3991, Adjusted R-squared:  0.3902 
## F-statistic: 44.95 on 6 and 406 DF,  p-value: < 2.2e-16
# check model diagnostics
plot(model2)

# check variance inflation factors
library(car)
## Loading required package: carData
vif(model2)
##               GVIF Df GVIF^(1/(2*Df))
## Statin    1.294016  1        1.137548
## Age       1.264724  1        1.124599
## Education 1.162706  3        1.025443
## CVD       1.161296  1        1.077634