# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Module 2 Day 2 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~

## Download and load the Site 18 Acupuncture dataset: 
download.file("http://www.duke.edu/~sgrambow/crp241data/site18.RData",
              destfile = "site18.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("site18.RData")

# Check: Was the data read in correctly?
# - How many observations are in the data 
#   set? Variables?
str(site18_data)
## 'data.frame':    43 obs. of  14 variables:
##  $ id           : int  536 539 540 546 547 548 549 550 553 554 ...
##  $ age          : int  26 53 54 54 39 37 40 33 49 50 ...
##  $ sex          : int  1 0 1 1 1 1 1 0 1 1 ...
##  $ migraine     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ chronicity   : int  10 10 40 19 7 26 5 10 31 37 ...
##  $ acupuncturist: int  6 6 6 6 6 6 6 6 6 6 ...
##  $ practice_id  : int  18 18 18 18 18 18 18 18 18 18 ...
##  $ group        : int  0 0 1 1 0 0 0 1 0 1 ...
##  $ pk1          : num  32.5 11.2 28.2 17 17 ...
##  $ pk2          : num  NA 18.8 13.8 NA NA ...
##  $ pk5          : num  NA NA 10 NA NA ...
##  $ f1           : num  17 10 18 10 14 24 13 23 28 13 ...
##  $ f2           : int  0 19 11 0 0 17 9 0 28 9 ...
##  $ f5           : int  0 0 9 0 0 9 11 0 28 11 ...
# Calculate summary measures for chronicity 
# in the overall sample: 

# Option 1: Use the summary function
# - To select the chronicity variable out of 
#   the data frame, use the $ after the name
#   of the data frame, followed by the name
#   of the chronicity variable
summary(site18_data$chronicity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   10.00   19.00   20.56   32.00   41.00
# Option 2: Calculate summary measures 
#           individually using measure-
#           specific functions
mean(site18_data$chronicity)      # Mean/Average
## [1] 20.55814
median(site18_data$chronicity)    # Median
## [1] 19
var(site18_data$chronicity)       # Variance
## [1] 171.7763
sd(site18_data$chronicity)        # Standard Deviation
## [1] 13.10635
IQR(site18_data$chronicity)       # Interquartile Range
## [1] 22
# Calcualte summary measures for chronicity 
# by treatment group:
# (1) Subset data on treatment group
acupunc <- subset(site18_data,group==1)   # Treatment = Acupuncture
control <- subset(site18_data,group==0)   # Control = No Acupuncture
# Check: Were the subsets created correctly?
# - How many observations are in the data 
#   set? Variables?

# (2) Calculate group specific summary measures:
# For Treatment Group:
mean(acupunc$chronicity)      
## [1] 21.25
median(acupunc$chronicity)    
## [1] 18.5
var(acupunc$chronicity)       
## [1] 159.6711
sd(acupunc$chronicity)        
## [1] 12.6361
IQR(acupunc$chronicity)    
## [1] 21.75
# For Control Group:
mean(control$chronicity)      
## [1] 19.95652
median(control$chronicity)    
## [1] 20
var(control$chronicity)       
## [1] 189.2253
sd(control$chronicity)        
## [1] 13.75592
IQR(control$chronicity)
## [1] 23.5
# Creat a histogram of chronicity in the 
# overall sample:
# - A histogram is essentially a bar graph 
#   that groups the values of a continuous 
#   variable into bins and plots the freq 
#   of each bin (or number of data values 
#   that fall into each bin). 
# - Now have 3 data sets in the workspace; 
#   which one should be used here?

# Standard Output:
hist(site18_data$chronicity)

# With more informative titles/labels:
hist(site18_data$chronicity,
     main="Histogram of Chronicity in Overall Sample",
     xlab="Chronicity")

# Create side-by-side box plots of chronicity 
# by treatment group:
# - A box plots plots horizontal lines at the 
#   min, first quartile, median, third quartile, 
#   and max of the data values. A box is drawn
#   around the first and third quartiles, and 
#   lines are extended out to the max and min. 
#   Potential outliers may be indicated by dots. 

# Standard Output:
boxplot(site18_data$chronicity ~ site18_data$group)

site18_data$cgroup <- ifelse(site18_data$group > 0,
                             c("acupunc"),c("control"))
boxplot(site18_data$chronicity ~ site18_data$cgroup)

# With more informative titles/labels:
boxplot(site18_data$chronicity ~ site18_data$group,
        main="Chronicity by Treatment Group",
        ylab="Chronicity",
        xlab="Treatment Group",
        names=c("Control","Acupuncture"))

# One important descriptive stat that is often 
# overlooked is the number of missing values 
# for each variable in a data set.
summary(site18_data)
##        id             age            sex           migraine     
##  Min.   :536.0   Min.   :23.0   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:556.5   1st Qu.:39.5   1st Qu.:1.000   1st Qu.:1.0000  
##  Median :580.0   Median :48.0   Median :1.000   Median :1.0000  
##  Mean   :587.4   Mean   :46.7   Mean   :0.907   Mean   :0.9535  
##  3rd Qu.:607.5   3rd Qu.:54.5   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :692.0   Max.   :65.0   Max.   :1.000   Max.   :1.0000  
##                                                                 
##    chronicity    acupuncturist  practice_id     group       
##  Min.   : 2.00   Min.   :6     Min.   :18   Min.   :0.0000  
##  1st Qu.:10.00   1st Qu.:6     1st Qu.:18   1st Qu.:0.0000  
##  Median :19.00   Median :6     Median :18   Median :0.0000  
##  Mean   :20.56   Mean   :6     Mean   :18   Mean   :0.4651  
##  3rd Qu.:32.00   3rd Qu.:6     3rd Qu.:18   3rd Qu.:1.0000  
##  Max.   :41.00   Max.   :6     Max.   :18   Max.   :1.0000  
##                                                             
##       pk1             pk2             pk5              f1       
##  Min.   : 9.00   Min.   : 4.75   Min.   : 2.00   Min.   : 4.00  
##  1st Qu.:16.62   1st Qu.:13.06   1st Qu.:10.88   1st Qu.:10.00  
##  Median :22.50   Median :18.25   Median :16.75   Median :14.00  
##  Mean   :25.16   Mean   :24.96   Mean   :20.34   Mean   :15.31  
##  3rd Qu.:31.00   3rd Qu.:32.50   3rd Qu.:24.00   3rd Qu.:20.00  
##  Max.   :68.50   Max.   :98.50   Max.   :73.75   Max.   :28.00  
##                  NA's   :9       NA's   :12                     
##        f2              f5            cgroup         
##  Min.   : 0.00   Min.   : 0.000   Length:43         
##  1st Qu.: 6.50   1st Qu.: 0.000   Class :character  
##  Median :11.00   Median : 9.000   Mode  :character  
##  Mean   :11.26   Mean   : 9.093                     
##  3rd Qu.:17.00   3rd Qu.:13.000                     
##  Max.   :28.00   Max.   :28.000                     
## 
# How many values are missing for the variable 
# headache severity at 1 year?

# Suppose we wanted to calculate the mean 
# head severity at 1 year in the overall 
# sample. How should we handle the missing 
# values?
mean(site18_data$pk5)
## [1] NA
mean(site18_data$pk5,na.rm=TRUE)
## [1] 20.33871
# So far, only discussed calculating summary
# measures for continous variables. Suppose
# we wanted to calculate summary measures 
# for the categorical variable sex: 

# Calculate frequncies for sex: 
table(site18_data$sex)
## 
##  0  1 
##  4 39
# Calculate proportions for sex: 
prop.table(table(site18_data$sex))
## 
##          0          1 
## 0.09302326 0.90697674
# You Try It

# (1) Find the mean and median age of 
#     patients enrolled at Site 18. 

# SOLUTION for (1)
# Approach #1 
mean(site18_data$age)
## [1] 46.69767
median(site18_data$age)
## [1] 48
# Approach #2 
summary(site18_data$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    23.0    39.5    48.0    46.7    54.5    65.0
# (2) Does age differ by treatment group in
#     this cohort? To address this question, 
#     create side-by-side boxplots. 

# SOLUTION for (2)
# Boxplot With informative titles/labels:
boxplot(site18_data$age ~ site18_data$group,
        main="Age by Treatment Group",
        ylab="Age (in years)",
        xlab="Treatment Group",
        names=c("Control","Acupuncture"))

# Interpretation
# There appears to be more variability in age within control
# Median age is higher in control vs treatment

# (3) How many patients enrolled at Site 18
#     were diagnosed to have tension-type
#     headaches at enrollment

# Solution for last number 3
# Variable is migraine
# here 0 is a tension type and 1 is a migraine type
# headache

# one approach is to use mean or summary
mean(site18_data$migraine) 
## [1] 0.9534884
# this gives 95.3 percent
# you could also use the table function to get 
# counts
table(site18_data$migraine)
## 
##  0  1 
##  2 41