# ~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Assignment 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~
# last updated on 10/25/21
# Prevention of REnal and Vascular END-stage Disease (PREVEND) study
# A study by Joosten, et al. examined the association of statin use
# and other variables with cognitive ability (as measured by the
# Ruff Figural Fluency Test (RFFT) measure of cognitive function)
# in a cross-sectional cohort of 4,095 participants from the
# Netherlands who were part of the larger PREVEND study.
# RFFT Scores range from 0 (worst) to 175 (best).
# The dataset we will work with includes a random sample of 500
# participants from the cohort.
# Data Dictionary
# Variables
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Age Age in years (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
# Sex Sex at birth ("Male","Female") (Factor Variable)
# Educ Education (integer: 0=primary school; 1=Lower secondary school;
# 2=Higher Secondary School; 3=University)
# Education ("Primary"; LowerSecond; HigherSecond; Univ) (Factor Variable)
# CVD 1=CVD present; 0=CVD absent (Factor Variable)
# SBP Systolic Blood Pressure (Numeric)
# DBP Diastolic Blood Pressure (Numeric)
# Smoking Smoking Status ("No","Yes") (Factor Variable)
# Chol Total Cholesterol (mmol/L) (Numeric)
# HDL HDL Cholesterol (mmol/L) (Numeric)
# DM Type 2 Diabetes status ("No","Yes") (Factor Variable)
# eGFR EGFR (mL/min/1.73m2) (Numeric)
# Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData"))
# if the above load command doesnt run for you try the
# set of commands below to load the data frame
# highlight and run all 3 lines of code
download.file("http://www.duke.edu/~sgrambow/crp241data/prevend.final2.RData",
destfile = "prevend.final2.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("prevend.final2.RData")
# examine structure of the data
str(prevend.final2)
## 'data.frame': 500 obs. of 14 variables:
## $ Age : int 55 65 46 68 70 53 64 70 46 44 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 2 1 ...
## $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 3 2 4 3 3 1 2 4 3 4 ...
## $ RFFT : int 62 79 89 70 35 14 31 47 88 91 ...
## $ CVD : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ DM : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ Smoking : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ SBP : num 122 108 120 114 114 ...
## $ DBP : num 63.5 66.5 75 67 72.5 75 77 76.5 99 70 ...
## $ eGFR : num 83.3 76.5 76.4 61.2 88.1 ...
## $ Chol : num 3.86 5.64 6.83 7.11 5.04 3.05 4.9 5.5 3.92 5.75 ...
## $ HDL : num 1.54 1.53 1.04 1.85 1.4 0.79 1.23 1.57 1.39 1.18 ...
## $ Statin : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ Educ : int 2 1 3 2 2 0 1 3 2 3 ...
# create subgroups by Statin Use status. These subgroups
# can be used as inputs to some of the functions we want
# to use to describe and test the data
yes.statin <- subset(prevend.final2,Statin=="Yes")
no.statin <- subset(prevend.final2,Statin=="No")
# lets use a boxplot to visualize the data and assess structural
# characteristics of the data
boxplot(yes.statin$RFFT,no.statin$RFFT,
main="RFFT by Statin Use",
ylab="RFFT",
xlab="Statin Use",
names=c("Statin User","Not Using Statin"),
col="light blue")
# add mean to each boxplot with points function
points(c(mean(yes.statin$RFFT),mean(no.statin$RFFT)),cex=1,pch=16,col="red")

# let's calculate some additional descriptive statistics
summary(yes.statin$RFFT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 44.00 57.00 60.66 73.00 134.00
summary(no.statin$RFFT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 47.00 71.00 70.71 91.00 136.00
# and separately calculate SD for each group since
# that is not included in summary()
sd(yes.statin$RFFT)
## [1] 24.80658
sd(no.statin$RFFT)
## [1] 27.73613
# Question 1
# The researchers decide to use the equal variances version of
# the t-test to address their research question. Which of the
# following is a correct summary of the point and interval estimates
# from that analysis (mark all that are correct)? Use the code in
# the RScript file to answer this question (note that you are welcome
# to write and run additional R code to help answer this question).
# Test Procedure 1
#
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
t.test(prevend.final2$RFFT ~ prevend.final2$Statin, var.equal=FALSE)
##
## Welch Two Sample t-test
##
## data: prevend.final2$RFFT by prevend.final2$Statin
## t = 3.7085, df = 206.49, p-value = 0.000268
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 4.708753 15.398079
## sample estimates:
## mean in group No mean in group Yes
## 70.71429 60.66087
mean(no.statin$RFFT)-mean(yes.statin$RFFT)
## [1] 10.05342
# Test Procedure 2
#
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
t.test(prevend.final2$RFFT ~ prevend.final2$Statin, var.equal=TRUE)
##
## Two Sample t-test
##
## data: prevend.final2$RFFT by prevend.final2$Statin
## t = 3.4917, df = 498, p-value = 0.0005226
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 4.396556 15.710276
## sample estimates:
## mean in group No mean in group Yes
## 70.71429 60.66087
mean(no.statin$RFFT)-mean(yes.statin$RFFT)
## [1] 10.05342
# Test Procedure 3
#
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
summary(aov(RFFT~Statin, data=prevend.final2))
## Df Sum Sq Mean Sq F value Pr(>F)
## Statin 1 8950 8950 12.19 0.000523 ***
## Residuals 498 365560 734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# install package if required
# install.packages("pairwiseCI")
library(pairwiseCI)
## Loading required package: MCPAN
## Loading required package: coin
## Loading required package: survival
pairwiseCI(RFFT~Statin, data=prevend.final2,var.equal = TRUE,p.adjust="none")
##
## 95 %-confidence intervals
## Method: Difference of means assuming Normal distribution and equal variances
##
##
## estimate lower upper
## Yes-No -10.05 -15.71 -4.397
##
##
# Functional Polymorphisms Associated with human Muscle Size and Strength
# study (FA-MuSS) measured a variety of demographic, phenotypic, and genetic
# characteristics for about 1,300 participants. Data from the study have been
# used in a number of subsequent studies, such as one examining the
# relationship between muscle strength and genotype at a location on the
# ACTN3 gene. The famuss dataset is a subset of the data for 595 participants.
# Data Dictionary
# Variables
# ndrm.ch Percent change in strength in non-dominant arm, comparing
# strength after to before training (numeric)
# drm.ch Percent change in strength in dominant arm, comparing
# strength after to before training (numeric)
# sex sex of the participant ("Female", "Male") (Factor Variable)
# age age in years (integer)
# race race ("African Am", "Asian", "Caucasian", "Hispanic", "Other")
# (Factor Variable)
# height height in inches (numeric)
# weight weight in pounds (numeric)
# actn3.r577x Genotype at the location r577x in the ACTN3 gene (Factor Variable)
# bmi bmi in kg/m^2 (numeric)
# Load the dataset
load(url("http://www.duke.edu/~sgrambow/crp241data/famuss.RData"))
str(famuss)
## 'data.frame': 595 obs. of 9 variables:
## $ ndrm.ch : num 40 25 40 125 40 75 100 57.1 33.3 20 ...
## $ drm.ch : num 40 0 0 0 20 0 0 -14.3 0 0 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 1 1 1 2 1 ...
## $ age : int 27 36 24 40 32 24 30 28 27 30 ...
## $ race : Factor w/ 5 levels "African Am","Asian",..: 3 3 3 3 3 4 3 3 4 3 ...
## $ height : num 65 71.7 65 68 61 62.2 65 68 68.2 62.2 ...
## $ weight : num 199 189 134 171 118 120 134 162 189 120 ...
## $ actn3.r577x: Factor w/ 3 levels "CC","CT","TT": 1 2 2 2 1 2 3 2 1 2 ...
## $ bmi : num 33.1 25.8 22.3 26 22.3 ...
## - attr(*, "na.action")= 'omit' Named int [1:802] 8 11 15 17 18 26 27 29 30 34 ...
## ..- attr(*, "names")= chr [1:802] "8" "11" "15" "17" ...
# grouping variable is a factor -- lets determine order/level of genotypes
str(famuss$actn3.r577x)
## Factor w/ 3 levels "CC","CT","TT": 1 2 2 2 1 2 3 2 1 2 ...
# so it is CC, CT, TT
# and determine how many participants with each genotype
summary(famuss$actn3.r577x)
## CC CT TT
## 173 261 161
# CC with n=173, CT with n=261, TT with n=161
# subsetting to create data frame for each group
CC.group <- subset(famuss,famuss$actn3.r577x=="CC")
CT.group <- subset(famuss,famuss$actn3.r577x=="CT")
TT.group <- subset(famuss,famuss$actn3.r577x=="TT")
# lets use a boxplot to visualize the data and
# assess structural characteristics.
boxplot(famuss$ndrm.ch ~ famuss$actn3.r577x,
main="Percent change in strength in non-dominant arm (after to before)
by genotype",
ylab="Percent change in strength",
xlab="Genotype",
names=c("CC","CT","TT"),
col=("light blue"))
# add means to boxplot
points(c(mean(CC.group$ndrm.ch),mean(CT.group$ndrm.ch),
mean(TT.group$ndrm.ch)),col="red",pch=20,cex=1)

# let's calculate some additional descriptive statistics
summary(CC.group$ndrm.ch)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 30.00 42.90 48.89 66.70 150.00
summary(CT.group$ndrm.ch)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 30.00 45.50 53.25 71.40 250.00
summary(TT.group$ndrm.ch)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 33.30 50.00 58.08 75.00 233.30
# and separately calculate SD for each group since
# that is not included in summary()
sd(CC.group$ndrm.ch)
## [1] 29.95608
sd(CT.group$ndrm.ch)
## [1] 33.23148
sd(TT.group$ndrm.ch)
## [1] 35.69133
# Perform One-Way ANOVA
summary(aov(ndrm.ch~actn3.r577x,data=famuss))
## Df Sum Sq Mean Sq F value Pr(>F)
## actn3.r577x 2 7043 3522 3.231 0.0402 *
## Residuals 592 645293 1090
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load Pairwise comparisons library
library(pairwiseCI)
# mean comparison method 1
pairwiseCI(ndrm.ch~actn3.r577x,data=famuss,var.equal = TRUE,conf.level = .95)
##
## 95 %-confidence intervals
## Method: Difference of means assuming Normal distribution and equal variances
##
##
## estimate lower upper
## CT-CC 4.355 -1.805 10.51
## TT-CC 9.190 2.114 16.26
## TT-CT 4.835 -1.900 11.57
##
##
summary(pairwiseTest(ndrm.ch~actn3.r577x,data=famuss),var.equal=TRUE,
p.adjust.method='none')
## p.val.adj p.val.raw comparison groupx groupy
## CT-CC 0.1567 0.1567 CT-CC CC CT
## TT-CC 0.0116 0.0116 TT-CC CC TT
## TT-CT 0.1663 0.1663 TT-CT CT TT
# mean comparison method 2
pairwiseCI(ndrm.ch~actn3.r577x,data=famuss,var.equal = TRUE,conf.level = 1-0.05/3)
##
## 98.33 %-confidence intervals
## Method: Difference of means assuming Normal distribution and equal variances
##
##
## estimate lower upper
## CT-CC 4.355 -3.1775 11.89
## TT-CC 9.190 0.5352 17.84
## TT-CT 4.835 -3.4005 13.07
##
##
summary(pairwiseTest(ndrm.ch~actn3.r577x,data=famuss),var.equal=TRUE,
p.adjust.method='bonferroni')
## p.val.adj p.val.raw comparison groupx groupy
## CT-CC 0.4701 0.1567 CT-CC CC CT
## TT-CC 0.0348 0.0116 TT-CC CC TT
## TT-CT 0.4988 0.1663 TT-CT CT TT
# mean comparison method 3
TukeyHSD(aov(ndrm.ch~actn3.r577x,data=famuss))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = ndrm.ch ~ actn3.r577x, data = famuss)
##
## $actn3.r577x
## diff lwr upr p adj
## CT-CC 4.354822 -3.2504838 11.96013 0.3706250
## TT-CC 9.189631 0.6948462 17.68442 0.0302867
## TT-CT 4.834809 -2.9390775 12.60870 0.3104223
# End of Program