# ~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Assignment 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~
# last updated on 10/25/21
# Prevention of REnal and Vascular END-stage Disease (PREVEND) study

# A study by Joosten, et al. examined the association of statin use
# and other variables with cognitive ability (as measured by the 
# Ruff Figural Fluency Test (RFFT) measure of cognitive function) 
# in a cross-sectional cohort of 4,095 participants from the 
# Netherlands who were part of the larger PREVEND study. 
# RFFT Scores range from 0 (worst) to 175 (best). 
# The dataset we will work with includes a random sample of 500 
# participants from the cohort. 

# Data Dictionary

# Variables
# RFFT       Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Age        Age in years (Integer) 
# Statin     Statin use ("No","Yes") (Factor Variable) 
# Sex        Sex at birth ("Male","Female") (Factor Variable) 
# Educ       Education (integer: 0=primary school; 1=Lower secondary school;
#                       2=Higher Secondary School; 3=University)
# Education  ("Primary"; LowerSecond; HigherSecond; Univ) (Factor Variable) 
# CVD        1=CVD present; 0=CVD absent (Factor Variable)
# SBP        Systolic Blood Pressure (Numeric)
# DBP        Diastolic Blood Pressure (Numeric)
# Smoking    Smoking Status ("No","Yes") (Factor Variable) 
# Chol       Total Cholesterol (mmol/L) (Numeric)
# HDL        HDL Cholesterol (mmol/L) (Numeric)
# DM         Type 2 Diabetes status ("No","Yes") (Factor Variable)
# eGFR       EGFR (mL/min/1.73m2) (Numeric)

# Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData"))

# if the above load command doesnt run for you try the
# set of commands below to load the data frame
# highlight and run all 3 lines of code
download.file("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData",
              destfile = "prevend.final2.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("prevend.final2.RData")

# examine structure of the data 
str(prevend.final2)
## 'data.frame':    500 obs. of  14 variables:
##  $ Age      : int  55 65 46 68 70 53 64 70 46 44 ...
##  $ Sex      : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 2 1 ...
##  $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 3 2 4 3 3 1 2 4 3 4 ...
##  $ RFFT     : int  62 79 89 70 35 14 31 47 88 91 ...
##  $ CVD      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DM       : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ Smoking  : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ SBP      : num  122 108 120 114 114 ...
##  $ DBP      : num  63.5 66.5 75 67 72.5 75 77 76.5 99 70 ...
##  $ eGFR     : num  83.3 76.5 76.4 61.2 88.1 ...
##  $ Chol     : num  3.86 5.64 6.83 7.11 5.04 3.05 4.9 5.5 3.92 5.75 ...
##  $ HDL      : num  1.54 1.53 1.04 1.85 1.4 0.79 1.23 1.57 1.39 1.18 ...
##  $ Statin   : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Educ     : int  2 1 3 2 2 0 1 3 2 3 ...
# create subgroups by Statin Use status.  These subgroups
# can be used as inputs to some of the functions we want
# to use to describe and test the data
yes.statin <- subset(prevend.final2,Statin=="Yes")  
no.statin <- subset(prevend.final2,Statin=="No")  

# lets use a boxplot to visualize the data and assess structural 
# characteristics of the data
boxplot(yes.statin$RFFT,no.statin$RFFT,
        main="RFFT by Statin Use",
        ylab="RFFT",
        xlab="Statin Use",
        names=c("Statin User","Not Using Statin"),
        col="light blue")

# add mean to each boxplot with points function
points(c(mean(yes.statin$RFFT),mean(no.statin$RFFT)),cex=1,pch=16,col="red")

# let's calculate some additional descriptive statistics
summary(yes.statin$RFFT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.00   44.00   57.00   60.66   73.00  134.00
summary(no.statin$RFFT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   47.00   71.00   70.71   91.00  136.00
# and separately calculate SD for each group since 
# that is not included in summary()
sd(yes.statin$RFFT)
## [1] 24.80658
sd(no.statin$RFFT)
## [1] 27.73613
# Question 1
# The researchers decide to use the equal variances version of 
# the t-test to address their research question.  Which of the 
# following is a correct summary of the point and interval estimates 
# from that analysis (mark all that are correct)?  Use the code in 
# the RScript file to answer this question (note that you are welcome 
# to write and run additional R code to help answer this question).

# Test Procedure 1
#
# RFFT       Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Statin     Statin use ("No","Yes") (Factor Variable) 
t.test(prevend.final2$RFFT ~ prevend.final2$Statin, var.equal=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  prevend.final2$RFFT by prevend.final2$Statin
## t = 3.7085, df = 206.49, p-value = 0.000268
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##   4.708753 15.398079
## sample estimates:
##  mean in group No mean in group Yes 
##          70.71429          60.66087
mean(no.statin$RFFT)-mean(yes.statin$RFFT)
## [1] 10.05342
# Test Procedure 2
#
# RFFT       Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Statin     Statin use ("No","Yes") (Factor Variable) 
t.test(prevend.final2$RFFT ~ prevend.final2$Statin, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  prevend.final2$RFFT by prevend.final2$Statin
## t = 3.4917, df = 498, p-value = 0.0005226
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##   4.396556 15.710276
## sample estimates:
##  mean in group No mean in group Yes 
##          70.71429          60.66087
mean(no.statin$RFFT)-mean(yes.statin$RFFT)
## [1] 10.05342
# Test Procedure 3
#
# RFFT       Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Statin     Statin use ("No","Yes") (Factor Variable) 
summary(aov(RFFT~Statin, data=prevend.final2))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Statin        1   8950    8950   12.19 0.000523 ***
## Residuals   498 365560     734                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# install package if required
# install.packages("pairwiseCI")
library(pairwiseCI)
## Loading required package: MCPAN
## Loading required package: coin
## Loading required package: survival
pairwiseCI(RFFT~Statin, data=prevend.final2,var.equal = TRUE,p.adjust="none")           
##   
## 95 %-confidence intervals 
##  Method:  Difference of means assuming Normal distribution and equal variances 
##   
##   
##        estimate  lower  upper
## Yes-No   -10.05 -15.71 -4.397
##   
## 
# Functional Polymorphisms Associated with human Muscle Size and Strength
# study (FA-MuSS) measured a variety of demographic, phenotypic, and genetic
# characteristics for about 1,300 participants. Data from the study have been
# used in a number of subsequent studies, such as one examining the
# relationship between muscle strength and genotype at a location on the 
# ACTN3 gene. The famuss dataset is a subset of the data for 595 participants.

# Data Dictionary

# Variables
# ndrm.ch       Percent change in strength in non-dominant arm, comparing
#               strength after to before training (numeric)
# drm.ch        Percent change in strength in dominant arm, comparing
#               strength after to before training (numeric)
# sex           sex of the participant ("Female", "Male") (Factor Variable)
# age           age in years (integer)
# race          race ("African Am", "Asian", "Caucasian", "Hispanic", "Other")
#               (Factor Variable)
# height        height in inches (numeric)
# weight        weight in pounds (numeric)
# actn3.r577x   Genotype at the location r577x in the ACTN3 gene (Factor Variable)
# bmi           bmi in kg/m^2 (numeric)

# Load the dataset
load(url("http://www.duke.edu/~sgrambow/crp241data/famuss.RData"))
str(famuss)
## 'data.frame':    595 obs. of  9 variables:
##  $ ndrm.ch    : num  40 25 40 125 40 75 100 57.1 33.3 20 ...
##  $ drm.ch     : num  40 0 0 0 20 0 0 -14.3 0 0 ...
##  $ sex        : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 1 1 1 2 1 ...
##  $ age        : int  27 36 24 40 32 24 30 28 27 30 ...
##  $ race       : Factor w/ 5 levels "African Am","Asian",..: 3 3 3 3 3 4 3 3 4 3 ...
##  $ height     : num  65 71.7 65 68 61 62.2 65 68 68.2 62.2 ...
##  $ weight     : num  199 189 134 171 118 120 134 162 189 120 ...
##  $ actn3.r577x: Factor w/ 3 levels "CC","CT","TT": 1 2 2 2 1 2 3 2 1 2 ...
##  $ bmi        : num  33.1 25.8 22.3 26 22.3 ...
##  - attr(*, "na.action")= 'omit' Named int [1:802] 8 11 15 17 18 26 27 29 30 34 ...
##   ..- attr(*, "names")= chr [1:802] "8" "11" "15" "17" ...
# grouping variable is a factor -- lets determine order/level of genotypes
str(famuss$actn3.r577x)
##  Factor w/ 3 levels "CC","CT","TT": 1 2 2 2 1 2 3 2 1 2 ...
# so it is CC, CT, TT

# and determine how many participants with each genotype
summary(famuss$actn3.r577x)
##  CC  CT  TT 
## 173 261 161
# CC with n=173, CT with n=261, TT with n=161

# subsetting to create data frame for each group 
CC.group <- subset(famuss,famuss$actn3.r577x=="CC")
CT.group <- subset(famuss,famuss$actn3.r577x=="CT")
TT.group <- subset(famuss,famuss$actn3.r577x=="TT")

# lets use a boxplot to visualize the data and 
# assess structural characteristics.
boxplot(famuss$ndrm.ch ~ famuss$actn3.r577x,
        main="Percent change in strength in non-dominant arm (after to before)
        by genotype",
        ylab="Percent change in strength",
        xlab="Genotype",
        names=c("CC","CT","TT"),
        col=("light blue"))

# add means to boxplot
points(c(mean(CC.group$ndrm.ch),mean(CT.group$ndrm.ch),
         mean(TT.group$ndrm.ch)),col="red",pch=20,cex=1)

# let's calculate some additional descriptive statistics
summary(CC.group$ndrm.ch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   30.00   42.90   48.89   66.70  150.00
summary(CT.group$ndrm.ch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   30.00   45.50   53.25   71.40  250.00
summary(TT.group$ndrm.ch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   33.30   50.00   58.08   75.00  233.30
# and separately calculate SD for each group since 
# that is not included in summary()
sd(CC.group$ndrm.ch)
## [1] 29.95608
sd(CT.group$ndrm.ch)
## [1] 33.23148
sd(TT.group$ndrm.ch)
## [1] 35.69133
# Perform One-Way ANOVA
summary(aov(ndrm.ch~actn3.r577x,data=famuss))
##              Df Sum Sq Mean Sq F value Pr(>F)  
## actn3.r577x   2   7043    3522   3.231 0.0402 *
## Residuals   592 645293    1090                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load Pairwise comparisons library
library(pairwiseCI)

#  mean comparison method 1
pairwiseCI(ndrm.ch~actn3.r577x,data=famuss,var.equal = TRUE,conf.level = .95)
##   
## 95 %-confidence intervals 
##  Method:  Difference of means assuming Normal distribution and equal variances 
##   
##   
##       estimate  lower upper
## CT-CC    4.355 -1.805 10.51
## TT-CC    9.190  2.114 16.26
## TT-CT    4.835 -1.900 11.57
##   
## 
summary(pairwiseTest(ndrm.ch~actn3.r577x,data=famuss),var.equal=TRUE,
        p.adjust.method='none')
##       p.val.adj p.val.raw comparison groupx groupy
## CT-CC    0.1567    0.1567      CT-CC     CC     CT
## TT-CC    0.0116    0.0116      TT-CC     CC     TT
## TT-CT    0.1663    0.1663      TT-CT     CT     TT
# mean comparison method 2
pairwiseCI(ndrm.ch~actn3.r577x,data=famuss,var.equal = TRUE,conf.level = 1-0.05/3)
##   
## 98.33 %-confidence intervals 
##  Method:  Difference of means assuming Normal distribution and equal variances 
##   
##   
##       estimate   lower upper
## CT-CC    4.355 -3.1775 11.89
## TT-CC    9.190  0.5352 17.84
## TT-CT    4.835 -3.4005 13.07
##   
## 
summary(pairwiseTest(ndrm.ch~actn3.r577x,data=famuss),var.equal=TRUE,
        p.adjust.method='bonferroni')
##       p.val.adj p.val.raw comparison groupx groupy
## CT-CC    0.4701    0.1567      CT-CC     CC     CT
## TT-CC    0.0348    0.0116      TT-CC     CC     TT
## TT-CT    0.4988    0.1663      TT-CT     CT     TT
# mean comparison method 3
TukeyHSD(aov(ndrm.ch~actn3.r577x,data=famuss))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = ndrm.ch ~ actn3.r577x, data = famuss)
## 
## $actn3.r577x
##           diff        lwr      upr     p adj
## CT-CC 4.354822 -3.2504838 11.96013 0.3706250
## TT-CC 9.189631  0.6948462 17.68442 0.0302867
## TT-CT 4.834809 -2.9390775 12.60870 0.3104223
# End of Program