#
# CRP245 Spring 2021 Final Exam
#
# R Code for Question 4-7
# Prevention of REnal and Vascular END-stage Disease (PREVEND) study

# Variables
# RFFT       Ruff Figural Fluency Test (RFFT) measure of cognitive function         
# Age        Age in years
# Statin     1=Statin User; 0=otherwise
# Sex        sex at birth (0=males; 1=females)
# Educ       Education (numeric: 0=primary school; 1=Lower secondary school;
#                       2=Higher Secondary School; 3=University)
# Education  Factor Variable (Primary; LowerSecond; HigherSecond; Univ)
# CVD        1=CVD present; 0=CVD absent
# SBP        Systolic Blood Pressure
# DBP        Diastolic Blood Pressure
# BMI        Body Mass Index (kg/m2)
# Smoking    1=smoker; 0=non-smoker
# Chol       Total Cholesterol (mmol/L)
# HDL        HDL Cholesterol (mmol/L)
# DM         1=Type 2 Diabetes YES; 0=No Type 2 Diab
# eGFR       EGFR (mL/min/1.73m2)

## Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData"))

# structure of the data 
str(prevend.final2)
## 'data.frame':    500 obs. of  14 variables:
##  $ Age      : int  55 65 46 68 70 53 64 70 46 44 ...
##  $ Sex      : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 2 1 ...
##  $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 3 2 4 3 3 1 2 4 3 4 ...
##  $ RFFT     : int  62 79 89 70 35 14 31 47 88 91 ...
##  $ CVD      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DM       : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ Smoking  : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ SBP      : num  122 108 120 114 114 ...
##  $ DBP      : num  63.5 66.5 75 67 72.5 75 77 76.5 99 70 ...
##  $ eGFR     : num  83.3 76.5 76.4 61.2 88.1 ...
##  $ Chol     : num  3.86 5.64 6.83 7.11 5.04 3.05 4.9 5.5 3.92 5.75 ...
##  $ HDL      : num  1.54 1.53 1.04 1.85 1.4 0.79 1.23 1.57 1.39 1.18 ...
##  $ Statin   : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Educ     : int  2 1 3 2 2 0 1 3 2 3 ...
# summary of overall data -- any missing values?
summary(prevend.final2)
##       Age            Sex             Education        RFFT        CVD     
##  Min.   :36.00   Male  :265   Primary     : 51   Min.   : 11.0   No :460  
##  1st Qu.:46.00   Female:235   LowerSecond :157   1st Qu.: 46.0   Yes: 40  
##  Median :54.00                HigherSecond:134   Median : 67.0            
##  Mean   :54.82                Univ        :158   Mean   : 68.4            
##  3rd Qu.:64.00                                   3rd Qu.: 88.0            
##  Max.   :81.00                                   Max.   :136.0            
##    DM      Smoking        SBP             DBP              eGFR       
##  No :467   No :384   Min.   : 77.5   Min.   : 48.00   Min.   : 38.48  
##  Yes: 33   Yes:116   1st Qu.:112.0   1st Qu.: 66.50   1st Qu.: 68.54  
##                      Median :123.0   Median : 72.00   Median : 76.36  
##                      Mean   :125.8   Mean   : 73.05   Mean   : 77.73  
##                      3rd Qu.:137.0   3rd Qu.: 78.62   3rd Qu.: 87.32  
##                      Max.   :188.5   Max.   :101.50   Max.   :131.50  
##       Chol            HDL         Statin         Educ      
##  Min.   :2.230   Min.   :-1.000   No :385   Min.   :0.000  
##  1st Qu.:4.647   1st Qu.: 1.150   Yes:115   1st Qu.:1.000  
##  Median :5.335   Median : 1.380             Median :2.000  
##  Mean   :5.382   Mean   : 1.405             Mean   :1.798  
##  3rd Qu.:6.053   3rd Qu.: 1.643             3rd Qu.:3.000  
##  Max.   :9.220   Max.   : 2.810             Max.   :3.000
# summary of outcome = RFFT

# numerical descriptives
summary(prevend.final2$RFFT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    11.0    46.0    67.0    68.4    88.0   136.0
sd(prevend.final2$RFFT)
## [1] 27.39565
# KEY POINT
# mean of RFFT:        68.1
# median of RFFT:      66.0
# stand dev of RFFT:   27.50

# graphical descriptives
# boxplot
# visualize Y with a nice looking
# boxplot that adds some extra labeling
boxplot(prevend.final2$RFFT,
        main="Boxplot of RFFT showing variability in Outcome",
        range=0, # this suppresses outliers since we are overlaying points
        ylab = "RFFT")
# add individual points to boxplot to show spread
# using stripchart function
stripchart(prevend.final2$RFFT, vertical=TRUE, add=TRUE, method="jitter",
           jitter=0.03,col='blue',cex=1.1, pch=20)
# add mean to boxplot with points function
points(mean(prevend.final2$RFFT),cex=2,pch=16,col="red")
# add a legend
legend("topright", c("Data Points", "Sample Mean","Sample Median"),
       cex=0.8,lty=c(1,1,1),lwd=c(3,3,3),
       col = c("blue","red","black",bty ="n"))
# add text with mean Y and Variance Y
text(1.16,80,"Mean =",adj=c(-1,0),col="red")
text(1.26,80,round(mean(prevend.final2$RFFT),2),adj=c(-1,0),col="red")
text(1.2,70,"SD =",adj=c(-1,0),col="blue")
text(1.26,70,round(sd(prevend.final2$RFFT),2),adj=c(-1,0),col="blue")

# Load needed packages
# need ggplot2 for visualizations of regression
# install.packages("ggplot2")
library(ggplot2)

# What is the relationship between RFFT (cognitive function) and 
# education level?
#

#create histogram: RFFT by Education
ggplot(prevend.final2, aes(x=Education, y=RFFT, fill=Education))+
        geom_boxplot()+
        geom_jitter(shape=16, position=position_jitter(0.1))

# NOTE: Model below uses numeric version of Education variable (Educ)
# MODEL 1 -- EDUC.SIMP.MODEL
# Simple Linear Regression: RFFT ~ Educ 
educ.simp.model  <- lm(RFFT ~ Educ, data = prevend.final2)
summary(educ.simp.model)
## 
## Call:
## lm(formula = RFFT ~ Educ, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.622 -16.148  -0.885  15.536  62.694 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   41.148      2.104   19.55   <2e-16 ***
## Educ          15.158      1.023   14.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.85 on 498 degrees of freedom
## Multiple R-squared:  0.3059, Adjusted R-squared:  0.3045 
## F-statistic: 219.5 on 1 and 498 DF,  p-value: < 2.2e-16
# Check Regression Diagnostics
plot(educ.simp.model)

# NOTE: Model below uses Factor version of Education variable (Education)
# Simple Linear Regression: RFFT ~ Education 
education.simp.model  <- lm(RFFT ~ Education, data = prevend.final2)
summary(education.simp.model)
## 
## Call:
## lm(formula = RFFT ~ Education, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.905 -15.975  -0.905  16.068  63.280 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             40.941      3.203  12.783  < 2e-16 ***
## EducationLowerSecond    14.779      3.686   4.009 7.04e-05 ***
## EducationHigherSecond   32.133      3.763   8.539  < 2e-16 ***
## EducationUniv           44.964      3.684  12.207  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.87 on 496 degrees of freedom
## Multiple R-squared:  0.3072, Adjusted R-squared:  0.303 
## F-statistic:  73.3 on 3 and 496 DF,  p-value: < 2.2e-16
# Check Regression Diagnostics
plot(education.simp.model)

#
# What is the relationship between RFFT (cognitive function) and statin use?
#

# visualize data (RFFT vs Statin) using ggplot boxplot
ggplot(prevend.final2, aes(x=Statin, y=RFFT, fill=Statin))+
        geom_boxplot()+
        geom_jitter(shape=16, position=position_jitter(0.1))

# MODEL 2 - STATIN.SIMP.MODEL
# Simple Linear Regression: RFFT ~ Statin
statin.simp.model  <- lm(RFFT ~ Statin, data = prevend.final2)
summary(statin.simp.model)
## 
## Call:
## lm(formula = RFFT ~ Statin, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.714 -22.714   0.286  18.299  73.339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   70.714      1.381  51.212  < 2e-16 ***
## StatinYes    -10.053      2.879  -3.492 0.000523 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.09 on 498 degrees of freedom
## Multiple R-squared:  0.0239, Adjusted R-squared:  0.02194 
## F-statistic: 12.19 on 1 and 498 DF,  p-value: 0.0005226
# Check Regression Diagnostics
plot(statin.simp.model)

# Consideration of Age as a confounder of the relationship between
# RFFT and Statin Use

# Visualizing relationships: RFFT by age with Statin Use
ggplot(prevend.final2, aes(x=Age, y=RFFT, shape=Statin,
                           color=Statin))+
        geom_point()

# MODEL 3 - STATIN.AGE.MODEL
statin.age.model  <- lm(RFFT ~ Statin + Age, data = prevend.final2)
summary(statin.age.model)
## 
## Call:
## lm(formula = RFFT ~ Statin + Age, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.855 -16.860  -1.178  15.730  58.751 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 137.8822     5.1221  26.919   <2e-16 ***
## StatinYes     0.8509     2.5957   0.328    0.743    
## Age          -1.2710     0.0943 -13.478   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.21 on 497 degrees of freedom
## Multiple R-squared:  0.2852, Adjusted R-squared:  0.2823 
## F-statistic: 99.13 on 2 and 497 DF,  p-value: < 2.2e-16
# Check Regression Diagnostics
plot(statin.age.model)

#
# What is the relationship between RFFT (cognitive function) and age?
#

# visualize data (RFFT vs Age) using ggplot scatter plot
ggplot(prevend.final2, aes(x=Age, y=RFFT))+
        geom_point() +
        geom_smooth(method=lm,formula= y~x,se=TRUE)

# Simple Linear Regression: RFFT ~ Age
age.simp.model  <- lm(RFFT ~ Age, data = prevend.final2)
summary(age.simp.model)
## 
## Call:
## lm(formula = RFFT ~ Age, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.879 -16.845  -1.095  15.524  58.564 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 137.54972    5.01614   27.42   <2e-16 ***
## Age          -1.26136    0.08953  -14.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.19 on 498 degrees of freedom
## Multiple R-squared:  0.285,  Adjusted R-squared:  0.2836 
## F-statistic: 198.5 on 1 and 498 DF,  p-value: < 2.2e-16
# Check Regression Diagnostics
plot(age.simp.model)

# MODEL 4 - INTERACTION.AGE.STATIN.MODEL
# INTERACTION MODEL FOR RFFT AGE AND STATIN USE
# 
interaction.age.statin.model <- lm(RFFT ~ Statin + Age + Statin*Age,
                                   data=prevend.final2)
summary(interaction.age.statin.model)  
## 
## Call:
## lm(formula = RFFT ~ Statin + Age + Statin * Age, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.551 -16.963  -1.179  15.764  58.802 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   140.2031     5.6209  24.943   <2e-16 ***
## StatinYes     -13.9720    15.0113  -0.931    0.352    
## Age            -1.3149     0.1040 -12.646   <2e-16 ***
## StatinYes:Age   0.2474     0.2468   1.003    0.317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.21 on 496 degrees of freedom
## Multiple R-squared:  0.2866, Adjusted R-squared:  0.2823 
## F-statistic: 66.42 on 3 and 496 DF,  p-value: < 2.2e-16
# MODEL 5 - STATIN.MULT.MODEL
# Running Multivariable Models to Account for Potential Confounding
# Consideration of multiple confounders: Age, Education, CVD status,
statin.mult.model <- lm(RFFT ~ Statin + Age + Education + CVD,
                    data=prevend.final2)
summary(statin.mult.model)  
## 
## Call:
## lm(formula = RFFT ~ Statin + Age + Education + CVD, data = prevend.final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.348 -15.586  -0.136  13.795  63.935 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           99.03507    6.33012  15.645  < 2e-16 ***
## StatinYes              4.69045    2.44802   1.916  0.05594 .  
## Age                   -0.92029    0.09041 -10.179  < 2e-16 ***
## EducationLowerSecond  10.08831    3.37556   2.989  0.00294 ** 
## EducationHigherSecond 21.30146    3.57768   5.954 4.98e-09 ***
## EducationUniv         33.12464    3.54710   9.339  < 2e-16 ***
## CVDYes                -7.56655    3.65164  -2.072  0.03878 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.71 on 493 degrees of freedom
## Multiple R-squared:  0.4355, Adjusted R-squared:  0.4286 
## F-statistic: 63.38 on 6 and 493 DF,  p-value: < 2.2e-16
# check model diagnostics
par(mfrow=c(2,2))
plot(statin.mult.model)

par(mfrow=c(1,1))

# check variance inflation factors
library(car)
## Loading required package: carData
vif(statin.mult.model)
##               GVIF Df GVIF^(1/(2*Df))
## Statin    1.237387  1        1.112379
## Age       1.278789  1        1.130835
## Education 1.201504  3        1.031069
## CVD       1.144221  1        1.069683