# ~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Assignment 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~
# QUESTION 1 -- PREVEND DATA
# Prevention of REnal and Vascular END-stage Disease
# (PREVEND) study
# A study by Joosten, et al. examined the association of statin use
# and other variables with cognitive ability (as measured by the
# Ruff Figural Fluency Test (RFFT) measure of cognitive function)
# in a cross-sectional cohort of 4,095 participants from the
# Netherlands who were part of the larger PREVEND study.
# RFFT Scores range from 0 (worst) to 175 (best).
# The dataset we will work with includes a random sample of 500
# participants from the cohort.
# Data Dictionary
# Variables
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Age Age in years (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
# Sex Sex at birth ("Male","Female") (Factor Variable)
# Educ Education (integer: 0=primary school; 1=Lower secondary school;
# 2=Higher Secondary School; 3=University)
# Education ("Primary"; LowerSecond; HigherSecond; Univ) (Factor Variable)
# CVD 1=CVD present; 0=CVD absent (Factor Variable)
# SBP Systolic Blood Pressure (Numeric)
# DBP Diastolic Blood Pressure (Numeric)
# Smoking Smoking Status ("No","Yes") (Factor Variable)
# Chol Total Cholesterol (mmol/L) (Numeric)
# HDL HDL Cholesterol (mmol/L) (Numeric)
# DM Type 2 Diabetes status ("No","Yes") (Factor Variable)
# eGFR EGFR (mL/min/1.73m2) (Numeric)
# Download and load the data file = prevend
load(url("https://www.duke.edu/~sgrambow/crp241data/prevend2024.RData"))
# examine structure of the data
str(prevend2024)
## 'data.frame': 500 obs. of 14 variables:
## $ Age : int 55 65 46 68 70 53 64 70 46 44 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 2 1 ...
## $ Education: Factor w/ 4 levels "Primary","LowerSecond",..: 3 2 4 3 3 1 2 4 3 4 ...
## $ RFFT : int 62 79 89 70 35 14 31 47 88 91 ...
## $ CVD : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ DM : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ Smoking : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ SBP : num 122 108 120 114 114 ...
## $ DBP : num 63.5 66.5 75 67 72.5 75 77 76.5 99 70 ...
## $ eGFR : num 83.3 76.5 76.4 61.2 88.1 ...
## $ Chol : num 3.86 5.64 6.83 7.11 5.04 3.05 4.9 5.5 3.92 5.75 ...
## $ HDL : num 1.54 1.53 1.04 1.85 1.4 0.79 1.23 1.57 1.39 1.18 ...
## $ Statin : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ Educ : int 2 1 3 2 2 0 1 3 2 3 ...
# create subgroups by Statin Use status. These subgroups
# can be used as inputs to some of the functions we want
# to use to describe and test the data
yes.statin <- subset(prevend2024,Statin=="Yes")
no.statin <- subset(prevend2024,Statin=="No")
# lets use a boxplot to visualize the data and assess structural
# characteristics of the data
boxplot(yes.statin$RFFT,no.statin$RFFT,
main="RFFT by Statin Use",
ylab="RFFT",
xlab="Statin Use",
names=c("Statin User","Not Using Statin"),
col=c("light blue","orange"))
# add mean to each boxplot with points function
points(c(mean(yes.statin$RFFT),mean(no.statin$RFFT)),cex=1,pch=16,col="brown")
# let's calculate some additional descriptive statistics
summary(yes.statin$RFFT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 44.00 57.00 60.66 73.00 134.00
summary(no.statin$RFFT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 47.00 71.00 70.71 91.00 136.00
# and separately calculate SD for each group since
# that is not included in summary()
sd(yes.statin$RFFT)
## [1] 24.80658
sd(no.statin$RFFT)
## [1] 27.73613
# To examine the relationship between RFFT score and statin
# use status, the researchers plan to compare the mean RFFT
# score among participants using statins vs. those not using
# statins. The researchers used the equal variances version
# of the t-test to address their research question.
# Which of the following are correct summaries of the point
# and interval estimates from that analysis (mark all that
# are correct)? Use the code in the RScript file to answer
# this question (note that you are welcome to write and run
# additional R code to help answer this question if you find
# it helpful to do so).
# Test Procedure 1
#
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
t.test(prevend2024$RFFT ~ prevend2024$Statin, var.equal=FALSE)
##
## Welch Two Sample t-test
##
## data: prevend2024$RFFT by prevend2024$Statin
## t = 3.7085, df = 206.49, p-value = 0.000268
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 4.708753 15.398079
## sample estimates:
## mean in group No mean in group Yes
## 70.71429 60.66087
mean(no.statin$RFFT)-mean(yes.statin$RFFT)
## [1] 10.05342
# Test Procedure 2
#
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
t.test(prevend2024$RFFT ~ prevend2024$Statin, var.equal=TRUE)
##
## Two Sample t-test
##
## data: prevend2024$RFFT by prevend2024$Statin
## t = 3.4917, df = 498, p-value = 0.0005226
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 4.396556 15.710276
## sample estimates:
## mean in group No mean in group Yes
## 70.71429 60.66087
mean(no.statin$RFFT)-mean(yes.statin$RFFT)
## [1] 10.05342
# Test Procedure 3
#
# RFFT Ruff Figural Fluency Test (RFFT) cognitive function (Integer)
# Statin Statin use ("No","Yes") (Factor Variable)
summary(aov(RFFT~Statin, data=prevend2024))
## Df Sum Sq Mean Sq F value Pr(>F)
## Statin 1 8950 8950 12.19 0.000523 ***
## Residuals 498 365560 734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# install package if required
# install.packages("pairwiseCI")
library(pairwiseCI)
## Loading required package: MCPAN
## Loading required package: coin
## Loading required package: survival
pairwiseCI(RFFT~Statin, data=prevend2024,var.equal = TRUE,p.adjust="none")
##
## 95 %-confidence intervals
## Method: Difference of means assuming Normal distribution and equal variances
##
##
## estimate lower upper
## Yes-No -10.05 -15.71 -4.397
##
##
# QUESTION 2 -- ORBITA
# if you want to do the sample size calculations in R
# we recommend you use the RScript code discussed in class
# which can be found in the Teams folder
# QUESTION 3 -- MENTAL ILLNESS ANALYSIS USING ANOVA
# PLP1 Gene Expression Data
# Researchers studying mental illness compared
# expression levels of several genes involved
# in the production and activity of myelin, a
# tissue important in nerve function, in the
# brains of 15 persons with schizophrenia,
# 15 with bipolar disorder, and 15 control
# individuals. The results presented in the
# accompanying table summarize the expression
# levels of the proteolipid protein 1 gene (PLP1)
# in the 45 brains. The main objective was to
# compare whether PLP1 gene expression differs
# among the schizophrenia, bipolar disorder, and
# control groups using one-way ANOVA.
# Load the dataset
load(url("http://www.duke.edu/~sgrambow/crp241data/gene_express.RData"))
# examine data structure
str(gene_express)
## 'data.frame': 45 obs. of 2 variables:
## $ group: chr "control" "control" "control" "control" ...
## $ plp1 : num -0.02 -0.27 -0.11 0.09 0.25 -0.02 0.48 -0.24 0.06 0.07 ...
# data management -- lets create a factor variable version
# of the group variable and call it f.group
gene_express$f.group <- factor(gene_express$group,
labels=c("bipolar","control", "schizo"))
# check coding
table(gene_express$group,gene_express$f.group)
##
## bipolar control schizo
## bipolar 15 0 0
## control 0 15 0
## schizo 0 0 15
# subsetting to create data frame for each group
control <- subset(gene_express,gene_express$f.group=='control')
bipolar <- subset(gene_express,gene_express$f.group=='bipolar')
schizo <- subset(gene_express,gene_express$f.group=='schizo')
# numerical descriptives by group
summary(bipolar$plp1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.4300 -0.3500 -0.3200 -0.2627 -0.2350 0.0600
summary(control$plp1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.300 -0.170 -0.020 -0.004 0.080 0.480
summary(schizo$plp1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.4000 -0.3250 -0.2300 -0.1953 -0.1100 0.2300
# summary doesnt produce SD so:
sd(bipolar$plp1)
## [1] 0.1512551
sd(control$plp1)
## [1] 0.2179056
sd(schizo$plp1)
## [1] 0.1822818
# lets use a boxplot to visualize the data and
# assess structural characteristics.
boxplot(gene_express$plp1 ~ gene_express$group,
main="PLP1 Expression by Group",
ylab="Normalized Expression",
xlab="Group",
range=0)
# add means to boxplot
group.means <- c(mean(bipolar$plp1),mean(control$plp1),
mean(schizo$plp1))
points(group.means,col="red",pch=20,cex=2)
# add indvidual points overlaid on each boxplot
stripchart(plp1 ~ group, vertical = TRUE, data = gene_express,
method = "jitter", add = TRUE, pch = 20, col = 'blue')
# Perform One-Way ANOVA
summary(aov(plp1~group,data=gene_express))
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 0.5403 0.27013 7.823 0.00129 **
## Residuals 42 1.4502 0.03453
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise comparisons
library(pairwiseCI)
# mean comparison method 1
pairwiseCI(plp1 ~ group,data=gene_express,var.equal = TRUE,p.adjust="none")
##
## 95 %-confidence intervals
## Method: Difference of means assuming Normal distribution and equal variances
##
##
## estimate lower upper
## control-bipolar 0.2587 0.1184 0.3990
## schizo-bipolar 0.0673 -0.0579 0.1926
## schizo-control -0.1913 -0.3416 -0.0411
##
##
summary(pairwiseTest(plp1 ~ group,data=gene_express),var.equal=TRUE,
p.adjust.method='none')
## p.val.adj p.val.raw comparison groupx groupy
## control-bipolar 0.0009 0.0009 control-bipolar bipolar control
## schizo-bipolar 0.2806 0.2806 schizo-bipolar bipolar schizo
## schizo-control 0.0146 0.0146 schizo-control control schizo
# mean comparison method 2
pairwiseCI(plp1 ~ group,data=gene_express,var.equal = TRUE,p.adjust="bonf")
##
## 95 %-confidence intervals
## Method: Difference of means assuming Normal distribution and equal variances
##
##
## estimate lower upper
## control-bipolar 0.2587 0.1184 0.3990
## schizo-bipolar 0.0673 -0.0579 0.1926
## schizo-control -0.1913 -0.3416 -0.0411
##
##
summary(pairwiseTest(plp1 ~ group,data=gene_express),var.equal=TRUE,
p.adjust.method='bonf')
## p.val.adj p.val.raw comparison groupx groupy
## control-bipolar 0.0026 0.0009 control-bipolar bipolar control
## schizo-bipolar 0.8418 0.2806 schizo-bipolar bipolar schizo
## schizo-control 0.0438 0.0146 schizo-control control schizo
# mean comparison method 3
TukeyHSD(aov(plp1 ~ group,data=gene_express))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = plp1 ~ group, data = gene_express)
##
## $group
## diff lwr upr p adj
## control-bipolar 0.25866667 0.09382065 0.42351268 0.0012670
## schizo-bipolar 0.06733333 -0.09751268 0.23217935 0.5857148
## schizo-control -0.19133333 -0.35617935 -0.02648732 0.0195775
# NOTE -- here is how you can create the table used in this
# assignment using the table1 package
# first need to install package
# only need to run install.packages one time for each new package
# install.packages("table1")
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table.data <- prevend2024
# here is a link to a vignette showing how to use this function
# also shows some advanced features
# https://benjaminrich.github.io/table1/vignettes/table1-examples.html
# Before we generate the table
# We should put nice labels on our variables
# for categorical variables, we want to convert them to factor variables.
# We already have factor variables so we are good to go there.
#
# We can also label our continuous variables using the label function
label(table.data$Chol) <- "Total Cholesterol (mmol/L) "
label(table.data$HDL) <- " HDL Cholesterol (mmol/L)"
# We can create a nicely formatted table with
# a single function call to table1
# Default behavior is to produce Min and Max instead of Q1 and Q3.
# Let's change that
table1(~ Age + Sex + CVD + Chol + HDL | Statin,
render.continuous=c(.="Mean (SD)", .="Median [Q1 - Q3]"),
data=table.data,
rowlabelhead = "Statin Use",
overall=NULL)
Statin Use | No (N=385) |
Yes (N=115) |
---|---|---|
Age | ||
Mean (SD) | 52.8 (11.4) | 61.4 (9.71) |
Median [Q1 - Q3] | 51.0 [44.0 - 61.0] | 61.0 [54.5 - 69.5] |
Sex | ||
Male | 192 (49.9%) | 73 (63.5%) |
Female | 193 (50.1%) | 42 (36.5%) |
CVD | ||
No | 374 (97.1%) | 86 (74.8%) |
Yes | 11 (2.9%) | 29 (25.2%) |
Total Cholesterol (mmol/L) | ||
Mean (SD) | 5.52 (1.03) | 4.92 (1.18) |
Median [Q1 - Q3] | 5.44 [4.86 - 6.18] | 4.78 [4.15 - 5.49] |
HDL Cholesterol (mmol/L) | ||
Mean (SD) | 1.45 (0.379) | 1.30 (0.351) |
Median [Q1 - Q3] | 1.40 [1.19 - 1.67] | 1.27 [1.04 - 1.53] |
# End of Program
# End of Program