# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 10 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Cholesterol Data Example: 
# Data was obtained from a study conducted to examine 
# the relationship between total serum cholesterol 
# levels and heart attacks among subjects who experienced 
# a recent ACS event. A total of 28 subjects were 
# recruited for the study. Each subject had their 
# cholesterol levels measured at 2 days, 4 days, 
# and 14 days post heart attack.

# Load the data: 
load(url("http://www.duke.edu/~sgrambow/crp241data/cholesterol-paired.RData"))

# Data Set: cholesterol.paired

# Data Dictionary: 
# (1) patient    patient identifier 
# (2) DAY2       total serum cholesterol (mg/dL) 
#                measured 2 days post-MI
# (3) DAY4       total serum cholesterol (mg/dL) 
#                measured 4 days post-MI
# (4) DAY14      total serum cholesterol (mg/dL) 
#                measured 14 days post-MI
# (5) DAY14MSNG  binary flag for DAY14 missingness 
#                (1 = missing vs. 0 = non-missing)

# Compute summary statistics for paired differences: 
# What is the difference between the next 
# command and the one below it?

# paired.diff <- cholesterol.paired$DAY4-cholesterol.paired$DAY2
cholesterol.paired$paired.diff <- cholesterol.paired$DAY4-
                                  cholesterol.paired$DAY2
summary(cholesterol.paired$paired.diff)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -108.00  -40.50  -14.00  -18.61    7.50   46.00
# - Histogram of paired differences:
hist(cholesterol.paired$paired.diff,freq=FALSE,
     main='4Days vs. 2Days Post-MI Differences',
     ylab='Density',
     xlab='Differences')

# - Boxplot of paired differences: 
boxplot(cholesterol.paired$paired.diff,
        main='4Days vs. 2Days Post-MI Differences',
        ylab='cholesterol levels')

# - Compute 95% CI for paired mean diff, test stat,and p-value:
t.test(cholesterol.paired$DAY4,cholesterol.paired$DAY2,paired=TRUE)
## 
##  Paired t-test
## 
## data:  cholesterol.paired$DAY4 and cholesterol.paired$DAY2
## t = -2.6529, df = 27, p-value = 0.0132
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -32.998364  -4.215922
## sample estimates:
## mean of the differences 
##               -18.60714
# OR Alternatively (and equivalently)
t.test(cholesterol.paired$paired.diff)
## 
##  One Sample t-test
## 
## data:  cholesterol.paired$paired.diff
## t = -2.6529, df = 27, p-value = 0.0132
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -32.998364  -4.215922
## sample estimates:
## mean of x 
## -18.60714
# - What if we (incorrectly) used the two sample t-test instead?
t.test(cholesterol.paired$DAY4,cholesterol.paired$DAY2)
## 
##  Welch Two Sample t-test
## 
## data:  cholesterol.paired$DAY4 and cholesterol.paired$DAY2
## t = -1.4828, df = 53.943, p-value = 0.1439
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -43.765617   6.551331
## sample estimates:
## mean of x mean of y 
##  235.3214  253.9286
mean(cholesterol.paired$DAY4)-mean(cholesterol.paired$DAY2)
## [1] -18.60714
########################################################
# Example: The Acupuncture Dataset -- needle.RData
# Key Variables
# group            = 0 is control, 1 is acupuncture
# pk1                = severity score (baseline)
# pk2                = severity score posttreatment
# pk5                = severity score (one year followup)
# ChangeFrom.pk1   = change in severity score (pk1-pk5)
# f1 heache frequency at baseline
# f1cat This is a categorized version of f1 with 
#      low = frequency <= 33rd percentile = 12
#      med = frequency > 33rd percentile = 12 
#            and <= 67th percentile = 19
#     high = frequency > 67th percentile = 19
#
########################################################

# Download and load the file
load(url("http://www.duke.edu/~sgrambow/crp245data/needle.RData"))
              
# Quick summary of key variables
summary(needle$pk1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.75   15.00   21.00   26.51   34.25   94.75
summary(needle$pk5)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    8.50   15.00   19.08   25.25   87.25     100
summary(needle$ChangeFrom.pk1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -29.000  -0.500   6.250   6.486  11.750  55.750     100
# Quick summary of key variables by group
# Subset data into two treatment groups
Treat <- subset(needle,needle$fgroup=="Treat")
Control <- subset(needle,needle$fgroup=="Control")

# Summary of outcome by group at baseline
summary(Treat$pk1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.75   14.25   20.75   25.61   33.00   87.50
summary(Control$pk1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.75   16.00   21.88   27.45   35.69   94.75
# Summary of outcome by group at 12 months
summary(Treat$pk5)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    7.25   12.00   16.25   21.50   73.73      44
summary(Control$pk5)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.25   10.44   17.00   22.34   28.75   87.25      56
# Summary of Change from baseline in outcome by group
summary(Treat$ChangeFrom.pk1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -28.733   0.500   7.750   8.329  13.500  55.750      44
summary(Control$ChangeFrom.pk1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -29.000  -2.000   4.000   4.367  10.062  36.750      56
# - Boxpots of key variables by group:
# - Boxplot of pk1: 
boxplot(pk1~fgroup,data=needle,
        main='Boxplots of pk1 by Group',
        ylab='pk1 -- Baseline Score')

# - Boxplot of pk5: 
boxplot(pk5~fgroup,data=needle,
        main='Boxplots of pk5 by Group',
        ylab='pk5 -- 12 month Score')

# - Boxplot of ChangeFrom.pk1: 
boxplot(ChangeFrom.pk1~fgroup,data=needle,
        main='Boxplots of Change from Baseline to 12 months by Group',
        ylab='pk1 - pk5 Change from Baseline to 12 months')

############################
## Analysis
############################


####Change Score - Two Sample t-test assuming equal variances
t.test(needle$ChangeFrom.pk1~needle$fgroup,var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  needle$ChangeFrom.pk1 by needle$fgroup
## t = -2.932, df = 299, p-value = 0.003628
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.621294 -1.302774
## sample estimates:
## mean in group Control   mean in group Treat 
##              4.367262              8.329296
mean(Treat$ChangeFrom.pk1,na.rm=TRUE)-mean(Control$ChangeFrom.pk1,na.rm=TRUE)
## [1] 3.962034
####Follow up score - Two Sample t-test assuming equal variances
t.test(needle$pk5~needle$fgroup,var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  needle$pk5 by needle$fgroup
## t = 3.4399, df = 299, p-value = 0.0006648
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.608793 9.584530
## sample estimates:
## mean in group Control   mean in group Treat 
##              22.34345              16.24679
mean(Control$pk5,na.rm=TRUE)-mean(Treat$pk5,na.rm=TRUE)
## [1] 6.096661
####Recall Simple Linear Regression
lm.slr<-lm(needle$pk5~needle$group)     
summary(lm.slr)
## 
## Call:
## lm(formula = needle$pk5 ~ needle$group)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.093  -9.997  -4.497   5.907  64.907 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    22.343      1.296   17.24  < 2e-16 ***
## needle$group   -6.097      1.772   -3.44 0.000665 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.34 on 299 degrees of freedom
##   (100 observations deleted due to missingness)
## Multiple R-squared:  0.03807,    Adjusted R-squared:  0.03485 
## F-statistic: 11.83 on 1 and 299 DF,  p-value: 0.0006648
confint(lm.slr)
##                 2.5 %    97.5 %
## (Intercept)  19.79257 24.894331
## needle$group -9.58453 -2.608793
####ANCOVA model
lm.ancova<-lm(needle$pk5~needle$pk1+needle$group)     
summary(lm.ancova)
## 
## Call:
## lm(formula = needle$pk5 ~ needle$pk1 + needle$group)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.881  -6.115  -1.059   5.946  43.041 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.45098    1.41672   2.436 0.015441 *  
## needle$pk1    0.70730    0.04055  17.444  < 2e-16 ***
## needle$group -4.58684    1.25177  -3.664 0.000294 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.81 on 298 degrees of freedom
##   (100 observations deleted due to missingness)
## Multiple R-squared:  0.5241, Adjusted R-squared:  0.5209 
## F-statistic: 164.1 on 2 and 298 DF,  p-value: < 2.2e-16
confint(lm.ancova)
##                   2.5 %     97.5 %
## (Intercept)   0.6629281  6.2390340
## needle$pk1    0.6275041  0.7870946
## needle$group -7.0502728 -2.1234090
# plotting the ancova model
#install.packages("HH")
library(HH)
## Warning: package 'HH' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: grid
## Loading required package: latticeExtra
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
## Loading required package: gridExtra

# plotting ancova
ancovaplot(pk5~pk1+fgroup,data=needle)

# check homogeneity of slopes -- include interaction
ancovaplot(pk5~pk1*fgroup,data=needle)

# End of Program