# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 | Module 4 Day 1 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Birth Weight Data Example ---------------------------------

# The goal of this study was to identify risk factors 
# associated with giving birth to a low birth weight baby 
# (weighing less than 2500 grams). Data was collected on 
# 189 women at a single US birthing center. Variables which 
# were thought to be of importance to infant birth weight
# were age of the subject, weight of the subject at her 
# last menstrual period, race, the subjects smoking status 
# during pregnancy, and the number of clinic (MD) visits 
# during the first trimester.
# 
# Data Dictionary: 
# ID      Identification Code
# LOW     Low Birth Weight (0=Birth Weight>=2500g;1=Birth Weight<2500g)
# AGE     Age of the Mother in Years                           
# LWT     Weight in Pounds at the Last Menstrual Period 
# RACE    Race (1=White,2=Black,3=Other)            
# SMOKE   Smoking Status During Pregnancy (1=Yes,0=No)
# PTL     History of Premature Labor (0=None,1=One, etc.) 
# HT      History of Hypertension (1=Yes,0=No)              
# UI      Presence of Uterine Irritability (1=Yes,0=No) 
# FTV     Number of MD Visits During the First Trimester (0=None,1=One,2=Two,etc)
# BWT     Birth Weight in Grams   

## Download and load the Birth Weight Data dataset: 
download.file("http://www.duke.edu/~sgrambow/crp241data/BirthWeightData.RData",
              destfile = "BirthWeightData.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("BirthWeightData.RData")

# Create subset of mothers who smoke:
smoke <- subset(bwdata,bwdata$SMOKE==1)

# Compute descriptive statistics and check assumptions
# Note: Birthweight data is in grams, but the 
#       hypothesis test is in pounds. Need to 
#       convert --> 1 pound = 453.592 grams. 
summary(smoke$BWT/453.592)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.563   5.226   6.119   6.114   7.156   9.343
# Histogram of birth weights: 
hist(smoke$BWT/453.592,freq=FALSE,
     main='Birth Weight Among Mothers Who Smoke',
     xlab='Birth Weight (lbs)')
box()

# Boxplot of birth weights: 
boxplot(smoke$BWT/453.592,
        main='Birth Weight of Babies Among Mothers Who Smoke',
        ylab='Birth Weight (lbs)')

# One Sample T-test for the Mean:
# (1) Compute 95% CI for the mean
t.test(smoke$BWT/453.592)
## 
##  One Sample t-test
## 
## data:  smoke$BWT/453.592
## t = 36.142, df = 73, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  5.776813 6.451106
## sample estimates:
## mean of x 
##   6.11396
# (2) Compute test stat and p-value for H1: mu < 7
t.test(smoke$BWT/453.592,alternative="less",mu=7)
## 
##  One Sample t-test
## 
## data:  smoke$BWT/453.592
## t = -5.2377, df = 73, p-value = 7.551e-07
## alternative hypothesis: true mean is less than 7
## 95 percent confidence interval:
##      -Inf 6.395789
## sample estimates:
## mean of x 
##   6.11396
# Heart Failure Trial Data Example --------------------------

# Current AMA guidelines state that exercise is beneficial 
# for patients with heart failure. However, there is limited
# clinical data to support this statement, especially among 
# patients with chronic heart failure. Previously published 
# studies have found evidence of a relationship between 
# improvements in exercise tolerance and physiologic markers, 
# suggesting potential mechanisms by which exercise training 
# might improve survival and decrease morbidity. 2275 patients 
# were randomized at 82 centers across the US in a 1:1 ratio
# to either an exercise training arm or to a usual care arm. 
# Among many outcomes measured to evaluate the effects of 
# exercise training on exercise tolerance, the trial measures 
# peak oxygen uptake (peak Vo2) at baseline and at 3, and 12 
# months post-randomization.
# 
# Data Dictionary: 
# SUBJNO      Subject ID number
# TRT         Randomize treatment arm [Usual Care; Exercise]
# ETIOLOGY    Ischemic or non-ischemic etiology [1=Ischemic; 2=Non-ischemic]
# NONADHERE   Non-adherence to treatment randomization [1=Yes; 0=NO]
# NYHACL      NHYA class at randomization [2=II; 3=III; 4=IV]
# BESTLVEF    Best available baseline LVEF
# AGE         Age at randomization (years)
# BMI         Body mass index at randomization
# SEX         Subject sex [1=male; 2=female]
# RACE        Self-reported race [1=black; 2=white; 3=other]
# DIABETES    History of diabetes [1=Yes; 0=No]
# PEAKVO2B    Peak VO2 on baseline CPX test (mL/kg/min)
# PEAKVO23    Peak VO2 on 3-month CPX test (mL/kg/min)
# PEAKVO212   Peak VO2 on 12-month CPX test (mL/kg/min)

## Download and load the Heart Failure Trial Data dataset: 
download.file("http://www.duke.edu/~sgrambow/crp241data/HFCT_Data.RData",
              destfile = "HFCT_Data.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("HFCT_Data.RData")

# Create subset for each treatment arm:
usual    <- subset(hfdata,hfdata$TRT=='Usual Care')
exercise <- subset(hfdata,hfdata$TRT=='Exercise')

# Compute descriptive statistics and check assumptions by treatment arm: 
summary(usual$PEAKVO212)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.20   12.20   15.40   16.04   19.30   38.80     467
summary(exercise$PEAKVO212)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.40   12.28   15.50   16.07   18.82   37.80     401
# Boxplots of cholesterol by case-control status
boxplot(hfdata$PEAKVO212~hfdata$TRT,
        main='Peak Vo2 Levels at 12 Months by Treatment Arm',
        ylab='Peak Vo2 on 12-month CPX test (mL/kg/min)')

# Two Sample T-test for Means:
# (1) Compute mean difference between two groups:
#     (i.e. the point estimate)
mean(exercise$PEAKVO212,na.rm=TRUE)-mean(usual$PEAKVO212,na.rm=TRUE)
## [1] 0.03914795
# (2) Compute 95% CI, test statistic, and p-value for 
#   difference in means 
# Option 1: Use ~ syntax
t.test(hfdata$PEAKVO212~hfdata$TRT)
## 
##  Welch Two Sample t-test
## 
## data:  hfdata$PEAKVO212 by hfdata$TRT
## t = 0.14527, df = 1394.9, p-value = 0.8845
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4894757  0.5677716
## sample estimates:
##   mean in group Exercise mean in group Usual Care 
##                 16.07433                 16.03519
# Option 2: Use , syntax
t.test(exercise$PEAKVO212,usual$PEAKVO212)
## 
##  Welch Two Sample t-test
## 
## data:  exercise$PEAKVO212 and usual$PEAKVO212
## t = 0.14527, df = 1394.9, p-value = 0.8845
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4894757  0.5677716
## sample estimates:
## mean of x mean of y 
##  16.07433  16.03519
#   - Note: Default in R is to assume variances are 
#          NOT equal, but can change that!
t.test(hfdata$PEAKVO212~hfdata$TRT,var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  hfdata$PEAKVO212 by hfdata$TRT
## t = 0.14522, df = 1405, p-value = 0.8846
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4896537  0.5679496
## sample estimates:
##   mean in group Exercise mean in group Usual Care 
##                 16.07433                 16.03519
# End of Program