# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Module 2 Day 2 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Download and load the Site 18 Acupuncture dataset:
download.file("http://www.duke.edu/~sgrambow/crp241data/site18.RData",
destfile = "site18.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("site18.RData")
# Check: Was the data read in correctly?
# - How many observations are in the data
# set? Variables?
str(site18_data)
## 'data.frame': 43 obs. of 14 variables:
## $ id : int 536 539 540 546 547 548 549 550 553 554 ...
## $ age : int 26 53 54 54 39 37 40 33 49 50 ...
## $ sex : int 1 0 1 1 1 1 1 0 1 1 ...
## $ migraine : int 1 1 1 1 1 1 1 1 1 1 ...
## $ chronicity : int 10 10 40 19 7 26 5 10 31 37 ...
## $ acupuncturist: int 6 6 6 6 6 6 6 6 6 6 ...
## $ practice_id : int 18 18 18 18 18 18 18 18 18 18 ...
## $ group : int 0 0 1 1 0 0 0 1 0 1 ...
## $ pk1 : num 32.5 11.2 28.2 17 17 ...
## $ pk2 : num NA 18.8 13.8 NA NA ...
## $ pk5 : num NA NA 10 NA NA ...
## $ f1 : num 17 10 18 10 14 24 13 23 28 13 ...
## $ f2 : int 0 19 11 0 0 17 9 0 28 9 ...
## $ f5 : int 0 0 9 0 0 9 11 0 28 11 ...
# Calculate summary measures for chronicity
# in the overall sample:
# Option 1: Use the summary function
# - To select the chronicity variable out of
# the data frame, use the $ after the name
# of the data frame, followed by the name
# of the chronicity variable
summary(site18_data$chronicity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 10.00 19.00 20.56 32.00 41.00
# Option 2: Calculate summary measures
# individually using measure-
# specific functions
mean(site18_data$chronicity) # Mean/Average
## [1] 20.55814
median(site18_data$chronicity) # Median
## [1] 19
var(site18_data$chronicity) # Variance
## [1] 171.7763
sd(site18_data$chronicity) # Standard Deviation
## [1] 13.10635
IQR(site18_data$chronicity) # Interquartile Range
## [1] 22
# Calcualte summary measures for chronicity
# by treatment group:
# (1) Subset data on treatment group
acupunc <- subset(site18_data,group==1) # Treatment = Acupuncture
control <- subset(site18_data,group==0) # Control = No Acupuncture
# Check: Were the subsets created correctly?
# - How many observations are in the data
# set? Variables?
# (2) Calculate group specific summary measures:
# For Treatment Group:
mean(acupunc$chronicity)
## [1] 21.25
median(acupunc$chronicity)
## [1] 18.5
var(acupunc$chronicity)
## [1] 159.6711
sd(acupunc$chronicity)
## [1] 12.6361
IQR(acupunc$chronicity)
## [1] 21.75
# For Control Group:
mean(control$chronicity)
## [1] 19.95652
median(control$chronicity)
## [1] 20
var(control$chronicity)
## [1] 189.2253
sd(control$chronicity)
## [1] 13.75592
IQR(control$chronicity)
## [1] 23.5
# Creat a histogram of chronicity in the
# overall sample:
# - A histogram is essentially a bar graph
# that groups the values of a continuous
# variable into bins and plots the freq
# of each bin (or number of data values
# that fall into each bin).
# - Now have 3 data sets in the workspace;
# which one should be used here?
# Standard Output:
hist(site18_data$chronicity)

# With more informative titles/labels:
hist(site18_data$chronicity,
main="Histogram of Chronicity in Overall Sample",
xlab="Chronicity")

# Create side-by-side box plots of chronicity
# by treatment group:
# - A box plots plots horizontal lines at the
# min, first quartile, median, third quartile,
# and max of the data values. A box is drawn
# around the first and third quartiles, and
# lines are extended out to the max and min.
# Potential outliers may be indicated by dots.
# Standard Output:
boxplot(site18_data$chronicity ~ site18_data$group)

site18_data$cgroup <- ifelse(site18_data$group > 0,
c("acupunc"),c("control"))
boxplot(site18_data$chronicity ~ site18_data$cgroup)

# With more informative titles/labels:
boxplot(site18_data$chronicity ~ site18_data$group,
main="Chronicity by Treatment Group",
ylab="Chronicity",
xlab="Treatment Group",
names=c("Control","Acupuncture"))

# One important descriptive stat that is often
# overlooked is the number of missing values
# for each variable in a data set.
summary(site18_data)
## id age sex migraine
## Min. :536.0 Min. :23.0 Min. :0.000 Min. :0.0000
## 1st Qu.:556.5 1st Qu.:39.5 1st Qu.:1.000 1st Qu.:1.0000
## Median :580.0 Median :48.0 Median :1.000 Median :1.0000
## Mean :587.4 Mean :46.7 Mean :0.907 Mean :0.9535
## 3rd Qu.:607.5 3rd Qu.:54.5 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :692.0 Max. :65.0 Max. :1.000 Max. :1.0000
##
## chronicity acupuncturist practice_id group
## Min. : 2.00 Min. :6 Min. :18 Min. :0.0000
## 1st Qu.:10.00 1st Qu.:6 1st Qu.:18 1st Qu.:0.0000
## Median :19.00 Median :6 Median :18 Median :0.0000
## Mean :20.56 Mean :6 Mean :18 Mean :0.4651
## 3rd Qu.:32.00 3rd Qu.:6 3rd Qu.:18 3rd Qu.:1.0000
## Max. :41.00 Max. :6 Max. :18 Max. :1.0000
##
## pk1 pk2 pk5 f1
## Min. : 9.00 Min. : 4.75 Min. : 2.00 Min. : 4.00
## 1st Qu.:16.62 1st Qu.:13.06 1st Qu.:10.88 1st Qu.:10.00
## Median :22.50 Median :18.25 Median :16.75 Median :14.00
## Mean :25.16 Mean :24.96 Mean :20.34 Mean :15.31
## 3rd Qu.:31.00 3rd Qu.:32.50 3rd Qu.:24.00 3rd Qu.:20.00
## Max. :68.50 Max. :98.50 Max. :73.75 Max. :28.00
## NA's :9 NA's :12
## f2 f5 cgroup
## Min. : 0.00 Min. : 0.000 Length:43
## 1st Qu.: 6.50 1st Qu.: 0.000 Class :character
## Median :11.00 Median : 9.000 Mode :character
## Mean :11.26 Mean : 9.093
## 3rd Qu.:17.00 3rd Qu.:13.000
## Max. :28.00 Max. :28.000
##
# How many values are missing for the variable
# headache severity at 1 year?
# Suppose we wanted to calculate the mean
# head severity at 1 year in the overall
# sample. How should we handle the missing
# values?
mean(site18_data$pk5)
## [1] NA
mean(site18_data$pk5,na.rm=TRUE)
## [1] 20.33871
# So far, only discussed calculating summary
# measures for continous variables. Suppose
# we wanted to calculate summary measures
# for the categorical variable sex:
# Calculate frequncies for sex:
table(site18_data$sex)
##
## 0 1
## 4 39
# Calculate proportions for sex:
prop.table(table(site18_data$sex))
##
## 0 1
## 0.09302326 0.90697674
# You Try It
# (1) Find the mean and median age of
# patients enrolled at Site 18.
# SOLUTION for (1)
# Approach #1
mean(site18_data$age)
## [1] 46.69767
median(site18_data$age)
## [1] 48
# Approach #2
summary(site18_data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.0 39.5 48.0 46.7 54.5 65.0
# (2) Does age differ by treatment group in
# this cohort? To address this question,
# create side-by-side boxplots.
# SOLUTION for (2)
# Boxplot With informative titles/labels:
boxplot(site18_data$age ~ site18_data$group,
main="Age by Treatment Group",
ylab="Age (in years)",
xlab="Treatment Group",
names=c("Control","Acupuncture"))

# Interpretation
# There appears to be more variability in age within control
# Median age is higher in control vs treatment
# (3) How many patients enrolled at Site 18
# were diagnosed to have tension-type
# headaches at enrollment
# Solution for last number 3
# Variable is migraine
# here 0 is a tension type and 1 is a migraine type
# headache
# one approach is to use mean or summary
mean(site18_data$migraine)
## [1] 0.9534884
# this gives 95.3 percent
# you could also use the table function to get
# counts
table(site18_data$migraine)
##
## 0 1
## 2 41