# ~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Assignment 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~

# QUESTION 1 -- PREVEND DATA

# Prevention of REnal and Vascular END-stage Disease 
# (PREVEND) study

# A study by Joosten, et al. examined the association of statin use
# and other variables with cognitive ability (as measured by the 
# Ruff Figural Fluency Test (RFFT) measure of cognitive function) 
# in a cross-sectional cohort of 4,095 participants from the 
# Netherlands who were part of the larger PREVEND study. 
# RFFT Scores range from 0 (worst) to 175 (best). 
# The dataset we will work with includes a random sample of 500 
# participants from the cohort. 

# Data Dictionary

# Variables
# RFFT       Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Age        Age in years (Integer) 
# Statin     Statin use ("No","Yes") (Factor Variable) 
# Sex        Sex at birth ("Male","Female") (Factor Variable) 
# Educ       Education (integer: 0=primary school; 1=Lower secondary school;
#                       2=Higher Secondary School; 3=University)
# Education  ("Primary"; LowerSecond; HigherSecond; Univ) (Factor Variable) 
# CVD        1=CVD present; 0=CVD absent (Factor Variable)
# SBP        Systolic Blood Pressure (Numeric)
# DBP        Diastolic Blood Pressure (Numeric)
# Smoking    Smoking Status ("No","Yes") (Factor Variable) 
# Chol       Total Cholesterol (mmol/L) (Numeric)
# HDL        HDL Cholesterol (mmol/L) (Numeric)
# DM         Type 2 Diabetes status ("No","Yes") (Factor Variable)
# eGFR       EGFR (mL/min/1.73m2) (Numeric)

# Download and load the data file = prevend
load(url("https://www.duke.edu/~sgrambow/crp241data/prevend2024.RData"))

# examine structure of the data 
str(prevend2024)
## 'data.frame':    500 obs. of  14 variables:
##  $ Age      : int  55 65 46 68 70 53 64 70 46 44 ...
##  $ Sex      : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 2 1 ...
##  $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 3 2 4 3 3 1 2 4 3 4 ...
##  $ RFFT     : int  62 79 89 70 35 14 31 47 88 91 ...
##  $ CVD      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DM       : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ Smoking  : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ SBP      : num  122 108 120 114 114 ...
##  $ DBP      : num  63.5 66.5 75 67 72.5 75 77 76.5 99 70 ...
##  $ eGFR     : num  83.3 76.5 76.4 61.2 88.1 ...
##  $ Chol     : num  3.86 5.64 6.83 7.11 5.04 3.05 4.9 5.5 3.92 5.75 ...
##  $ HDL      : num  1.54 1.53 1.04 1.85 1.4 0.79 1.23 1.57 1.39 1.18 ...
##  $ Statin   : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Educ     : int  2 1 3 2 2 0 1 3 2 3 ...
# create subgroups by Statin Use status.  These subgroups
# can be used as inputs to some of the functions we want
# to use to describe and test the data
yes.statin <- subset(prevend2024,Statin=="Yes")  
no.statin <- subset(prevend2024,Statin=="No")  

# lets use a boxplot to visualize the data and assess structural 
# characteristics of the data
boxplot(yes.statin$RFFT,no.statin$RFFT,
        main="RFFT by Statin Use",
        ylab="RFFT",
        xlab="Statin Use",
        names=c("Statin User","Not Using Statin"),
        col=c("light blue","orange"))

# add mean to each boxplot with points function
points(c(mean(yes.statin$RFFT),mean(no.statin$RFFT)),cex=1,pch=16,col="brown")

# let's calculate some additional descriptive statistics
summary(yes.statin$RFFT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.00   44.00   57.00   60.66   73.00  134.00
summary(no.statin$RFFT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   47.00   71.00   70.71   91.00  136.00
# and separately calculate SD for each group since 
# that is not included in summary()
sd(yes.statin$RFFT)
## [1] 24.80658
sd(no.statin$RFFT)
## [1] 27.73613
# To examine the relationship between RFFT score and statin 
# use status, the researchers plan to compare the mean RFFT 
# score among participants using statins vs. those not using 
# statins.  The researchers used the equal variances version 
# of the t-test to address their research question.  
# Which of the following are correct summaries of the point 
# and interval estimates from that analysis (mark all that 
# are correct)?  Use the code in the RScript file to answer 
# this question (note that you are welcome to write and run 
# additional R code to help answer this question if you find 
# it helpful to do so).

# Test Procedure 1
#
# RFFT    Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Statin  Statin use ("No","Yes") (Factor Variable) 
t.test(prevend2024$RFFT ~ prevend2024$Statin, var.equal=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  prevend2024$RFFT by prevend2024$Statin
## t = 3.7085, df = 206.49, p-value = 0.000268
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##   4.708753 15.398079
## sample estimates:
##  mean in group No mean in group Yes 
##          70.71429          60.66087
mean(no.statin$RFFT)-mean(yes.statin$RFFT)
## [1] 10.05342
# Test Procedure 2
#
# RFFT    Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Statin  Statin use ("No","Yes") (Factor Variable) 
t.test(prevend2024$RFFT ~ prevend2024$Statin, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  prevend2024$RFFT by prevend2024$Statin
## t = 3.4917, df = 498, p-value = 0.0005226
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##   4.396556 15.710276
## sample estimates:
##  mean in group No mean in group Yes 
##          70.71429          60.66087
mean(no.statin$RFFT)-mean(yes.statin$RFFT)
## [1] 10.05342
# Test Procedure 3
#
# RFFT    Ruff Figural Fluency Test (RFFT) cognitive function (Integer)          
# Statin  Statin use ("No","Yes") (Factor Variable) 
summary(aov(RFFT~Statin, data=prevend2024))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Statin        1   8950    8950   12.19 0.000523 ***
## Residuals   498 365560     734                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# install package if required
# install.packages("pairwiseCI")
library(pairwiseCI)
## Loading required package: MCPAN
## Loading required package: coin
## Loading required package: survival
pairwiseCI(RFFT~Statin, data=prevend2024,var.equal = TRUE,p.adjust="none")           
##   
## 95 %-confidence intervals 
##  Method:  Difference of means assuming Normal distribution and equal variances 
##   
##   
##        estimate  lower  upper
## Yes-No   -10.05 -15.71 -4.397
##   
## 
# QUESTION 2 -- ORBITA
# if you want to do the sample size calculations in R
# we recommend you use the RScript code discussed in class
#  which can be found in the Teams folder
# QUESTION 3 -- MENTAL ILLNESS ANALYSIS USING ANOVA
# PLP1 Gene Expression Data

# Researchers studying mental illness compared 
# expression levels of several genes involved 
# in the production and activity of myelin, a 
# tissue important in nerve function, in the 
# brains of 15 persons with schizophrenia, 
# 15 with bipolar disorder, and 15 control 
# individuals. The results presented in the 
# accompanying table summarize the expression 
# levels of the proteolipid protein 1 gene (PLP1) 
# in the 45 brains.  The main objective was to 
# compare whether PLP1 gene expression differs 
# among the schizophrenia, bipolar disorder, and 
# control groups using one-way ANOVA.  

# Load the dataset
load(url("http://www.duke.edu/~sgrambow/crp241data/gene_express.RData"))

# examine data structure
str(gene_express)
## 'data.frame':    45 obs. of  2 variables:
##  $ group: chr  "control" "control" "control" "control" ...
##  $ plp1 : num  -0.02 -0.27 -0.11 0.09 0.25 -0.02 0.48 -0.24 0.06 0.07 ...
# data management -- lets create a factor variable version
# of the group variable and call it f.group
gene_express$f.group <- factor(gene_express$group,
                               labels=c("bipolar","control", "schizo"))

# check coding
table(gene_express$group,gene_express$f.group)
##          
##           bipolar control schizo
##   bipolar      15       0      0
##   control       0      15      0
##   schizo        0       0     15
# subsetting to create data frame for each group 
control <- subset(gene_express,gene_express$f.group=='control')
bipolar <- subset(gene_express,gene_express$f.group=='bipolar')
schizo <- subset(gene_express,gene_express$f.group=='schizo')


# numerical descriptives by group
summary(bipolar$plp1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4300 -0.3500 -0.3200 -0.2627 -0.2350  0.0600
summary(control$plp1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -0.300  -0.170  -0.020  -0.004   0.080   0.480
summary(schizo$plp1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4000 -0.3250 -0.2300 -0.1953 -0.1100  0.2300
# summary doesnt produce SD so:
sd(bipolar$plp1)
## [1] 0.1512551
sd(control$plp1)
## [1] 0.2179056
sd(schizo$plp1)
## [1] 0.1822818
# lets use a boxplot to visualize the data and 
# assess structural characteristics.
boxplot(gene_express$plp1 ~ gene_express$group,
        main="PLP1 Expression by Group",
        ylab="Normalized Expression",
        xlab="Group",
        range=0)

# add means to boxplot
group.means <- c(mean(bipolar$plp1),mean(control$plp1),
                 mean(schizo$plp1))
points(group.means,col="red",pch=20,cex=2)

# add indvidual points overlaid on each boxplot
stripchart(plp1 ~ group, vertical = TRUE, data = gene_express, 
           method = "jitter", add = TRUE, pch = 20, col = 'blue')

# Perform One-Way ANOVA
summary(aov(plp1~group,data=gene_express))
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## group        2 0.5403 0.27013   7.823 0.00129 **
## Residuals   42 1.4502 0.03453                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise comparisons
library(pairwiseCI)

# mean comparison method 1
pairwiseCI(plp1 ~ group,data=gene_express,var.equal = TRUE,p.adjust="none")
##   
## 95 %-confidence intervals 
##  Method:  Difference of means assuming Normal distribution and equal variances 
##   
##   
##                 estimate   lower   upper
## control-bipolar   0.2587  0.1184  0.3990
## schizo-bipolar    0.0673 -0.0579  0.1926
## schizo-control   -0.1913 -0.3416 -0.0411
##   
## 
summary(pairwiseTest(plp1 ~ group,data=gene_express),var.equal=TRUE,
        p.adjust.method='none')
##                 p.val.adj p.val.raw      comparison  groupx  groupy
## control-bipolar    0.0009    0.0009 control-bipolar bipolar control
## schizo-bipolar     0.2806    0.2806  schizo-bipolar bipolar  schizo
## schizo-control     0.0146    0.0146  schizo-control control  schizo
# mean comparison method 2
pairwiseCI(plp1 ~ group,data=gene_express,var.equal = TRUE,p.adjust="bonf")
##   
## 95 %-confidence intervals 
##  Method:  Difference of means assuming Normal distribution and equal variances 
##   
##   
##                 estimate   lower   upper
## control-bipolar   0.2587  0.1184  0.3990
## schizo-bipolar    0.0673 -0.0579  0.1926
## schizo-control   -0.1913 -0.3416 -0.0411
##   
## 
summary(pairwiseTest(plp1 ~ group,data=gene_express),var.equal=TRUE,
        p.adjust.method='bonf')
##                 p.val.adj p.val.raw      comparison  groupx  groupy
## control-bipolar    0.0026    0.0009 control-bipolar bipolar control
## schizo-bipolar     0.8418    0.2806  schizo-bipolar bipolar  schizo
## schizo-control     0.0438    0.0146  schizo-control control  schizo
# mean comparison method 3
TukeyHSD(aov(plp1 ~ group,data=gene_express))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = plp1 ~ group, data = gene_express)
## 
## $group
##                        diff         lwr         upr     p adj
## control-bipolar  0.25866667  0.09382065  0.42351268 0.0012670
## schizo-bipolar   0.06733333 -0.09751268  0.23217935 0.5857148
## schizo-control  -0.19133333 -0.35617935 -0.02648732 0.0195775
# NOTE -- here is how you can create the table used in this
# assignment using the table1 package

# first need to install package
# only need to run install.packages one time for each new package
# install.packages("table1")
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table.data <- prevend2024
# here is a link to a vignette showing how to use this function
# also shows some advanced features
# https://benjaminrich.github.io/table1/vignettes/table1-examples.html

# Before we generate the table
# We should put nice labels on our variables

# for categorical variables, we want to convert them to factor variables.
# We already have factor variables so we are good to go there.
#
# We can also label our continuous variables using the label function
label(table.data$Chol) <- "Total Cholesterol (mmol/L) "
label(table.data$HDL) <- " HDL Cholesterol (mmol/L)" 


# We can create a nicely formatted table with 
# a single function call to table1
# Default behavior is to produce Min and Max instead of Q1 and Q3.  
# Let's change that 
table1(~ Age + Sex + CVD + Chol + HDL | Statin, 
       render.continuous=c(.="Mean (SD)", .="Median [Q1 - Q3]"),
       data=table.data,
       rowlabelhead = "Statin Use",
       overall=NULL)
Statin Use No
(N=385)
Yes
(N=115)
Age
Mean (SD) 52.8 (11.4) 61.4 (9.71)
Median [Q1 - Q3] 51.0 [44.0 - 61.0] 61.0 [54.5 - 69.5]
Sex
Male 192 (49.9%) 73 (63.5%)
Female 193 (50.1%) 42 (36.5%)
CVD
No 374 (97.1%) 86 (74.8%)
Yes 11 (2.9%) 29 (25.2%)
Total Cholesterol (mmol/L)
Mean (SD) 5.52 (1.03) 4.92 (1.18)
Median [Q1 - Q3] 5.44 [4.86 - 6.18] 4.78 [4.15 - 5.49]
HDL Cholesterol (mmol/L)
Mean (SD) 1.45 (0.379) 1.30 (0.351)
Median [Q1 - Q3] 1.40 [1.19 - 1.67] 1.27 [1.04 - 1.53]
# End of Program

# End of Program