# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 1 Day 3 ~
# ~ Creating Data from a   ~
# ~ Manuscript to check    ~
# ~ Results                ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Nebulized Hypertonic Saline for Bronchiolitis
# A Randomized Clinical Trial
# Author: Wu et al.
# Citation:
# JAMA Pediatrics, 168(7), 657-63. 
# URL: https://doi.org/10.1001/jamapediatrics.2014.301

# We want to verify the primary unadjusted analysis
# for admission rate (out of admitted/not admitted)
# Odds ratio of admission (HS Group vs NS Group)
# OR = 0.55 (95% CI, 0.36-0.83)
# meaning that odds of admission are 45% less in HS
# group than the NS group.
# Creating Dataset
# Let's create a dataset that contains the following variables:
#    patientID -- ID number from 1 to 408
#    admitted -- 0=Not Admitted, 1=Admitted
#    trtgroup -- 0=NS Group; 1=HS Group
# 
# create a sequence of integers from 1 to 408
patientID <- seq(1,408,1)
# Total in NS Group = 197
# NS Admitted = 84 (42.6%)
# NS Not Admitted = 197-84 = 113

# Total in HS Group = 211
# HS Admitted = 61 (28.9%)
# HS Not Admitted = 211-61 = 150

# STEP 1:
# Make a vector with 145 ones (84NS+61HS=145)
# representing the admitted patients in
# both of the treatment groups. 
# STEP 2: 
# Make a vector with 263 zeroes (113NS+150HS=263)
# representing everyone else in the sample, 
# all of whom were not admitted.
# STEP 3: 
# Let's combine these using the c() function
# into a single vector called 'admitted'
# of length 408 equal to the total sample size

admitted <- c(rep(1,145),rep(0,263))

# STEP 3:
# The next variable to create is the treatment 
# group variable which we will call trtgroup.
# we need a series of vectors representing
# those in each group who were admitted and 
# not admitted.
# STEP 3a:
# Let's make trtgroup=0 be the NS group
#            trtgroup=1 be the HS group
# For the admitted (n=145) we have
# n=84 in the nsgroup and n=61 in the hsgroup
# We can represent this by two vectors
# for the NS group (trtgroup=0) -- rep(0,84)
# for the HS group (trtgroup=1) -- rep(1,61)

# For those NOT admitted (n=263) we have
# n=113 n the nsgroup and n=150 in the hsgroup
# We can represent this by two vectors
# for the NS group (trtgroup=0) -- rep(0,113)
# for the HS group (trtgroup=1) -- rep(1,150)

# Now let's connect these vectors into one 
# variable that matches the admitted variable
# created above in the following order:
# HS group admitted, NS group admitted, NS group not admitted,
# and HS group not admitted

hsgroup <- c(rep(0,84),rep(1,61),rep(0,113),rep(1,150))

# Lastly, pull these three vectors we have created
# together into a data frame.
hyper <- data.frame(patientID,admitted,hsgroup)
#
#
# Now let's summaryize/visualize the data

str(hyper)
## 'data.frame':    408 obs. of  3 variables:
##  $ patientID: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ admitted : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ hsgroup  : num  0 0 0 0 0 0 0 0 0 0 ...
summary(hyper)
##    patientID        admitted         hsgroup      
##  Min.   :  1.0   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:102.8   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :204.5   Median :0.0000   Median :1.0000  
##  Mean   :204.5   Mean   :0.3554   Mean   :0.5172  
##  3rd Qu.:306.2   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :408.0   Max.   :1.0000   Max.   :1.0000
# Recall the table1 package from CRP241
# install.packages("table1") # only run this if you need to

library(table1)
## Warning: package 'table1' was built under R version 4.0.3
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table.data <- hyper

# we need to change our variables to factors for 
# compatibility with table1 package

table.data$hsgroup <- factor(table.data$hsgroup,
                             labels = c("Normal","Hypertonic"))

table.data$admitted <- factor(table.data$admitted,
                               labels = c("NO","YES"))

# We can create a nicely formatted table with 
# a single function call to table1

table1(~ admitted | hsgroup, data=table.data)
Normal
(N=197)
Hypertonic
(N=211)
Overall
(N=408)
admitted
NO 113 (57.4%) 150 (71.1%) 263 (64.5%)
YES 84 (42.6%) 61 (28.9%) 145 (35.5%)
#
#
# Unadjusted Analyses using Chi-square
# lets create the 2 x 2 outcome table
# for admission variable
admission.table <- table(admitted,hsgroup)
rownames(admission.table) <- c("NO","YES")
colnames(admission.table) <- c("NS","HS")
admission.table
##         hsgroup
## admitted  NS  HS
##      NO  113 150
##      YES  84  61
# what if we wanted to create this by hand?
byhand.table <- rbind(c(113,150),c(84,61))
rownames(byhand.table) <- c("NO","YES")
colnames(byhand.table) <- c("NS","HS")
byhand.table
##      NS  HS
## NO  113 150
## YES  84  61
# Now let's analyze the 2 x 2 table 
# using the 1st table we created (admissioni.table)
# Here is the standard Pearson Chi-Square Test
chisq.test(admission.table,correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  admission.table
## X-squared = 8.3831, df = 1, p-value = 0.003787
# We can use epitools to calculate the Odds Ratio (OR)
# remember that you may need to run the 
# install.packages function if you haven't installed
# epitools before
# install.packages("epitools")
library(epitools)
## Warning: package 'epitools' was built under R version 4.0.3
oddsratio(admission.table)
## $data
##         hsgroup
## admitted  NS  HS Total
##    NO    113 150   263
##    YES    84  61   145
##    Total 197 211   408
## 
## $measure
##         odds ratio with 95% C.I.
## admitted  estimate     lower     upper
##      NO  1.0000000        NA        NA
##      YES 0.5482102 0.3623922 0.8253729
## 
## $p.value
##         two-sided
## admitted  midp.exact fisher.exact  chi.square
##      NO           NA           NA          NA
##      YES 0.003933746  0.005114295 0.003787306
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
# Odds ratio of admission (HS Group vs NS Group)
# OR = 0.55 (95% CI, 0.36-0.83)
# meaning that odds of admission are 45% less in HS
# group than the NS group.

# We can also generate the unadjusted odds ratio using 
# logistic regression:
hyper.fit <- glm(admitted ~ hsgroup, data=hyper, family='binomial')
summary(hyper.fit)
## 
## Call:
## glm(formula = admitted ~ hsgroup, family = "binomial", data = hyper)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0543  -1.0543  -0.8261   1.3057   1.5754  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.2966     0.1441  -2.059  0.03953 * 
## hsgroup      -0.6032     0.2093  -2.882  0.00396 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 530.99  on 407  degrees of freedom
## Residual deviance: 522.58  on 406  degrees of freedom
## AIC: 526.58
## 
## Number of Fisher Scoring iterations: 4
# This yields the coefficient table:
# Coefficients:
#             Estimate Std. Error z value Pr(>|z|)   
# (Intercept)  -0.2966     0.1441  -2.059  0.03953 * 
# nsgroup      -0.6032     0.2093  -2.882  0.00396 **

# We can exponentiate to generate the desired odds ratio
exp(hyper.fit$coefficients)
## (Intercept)     hsgroup 
##   0.7433628   0.5470635
# This yields:
# (Intercept)     hsgroup 
#   0.7433628   0.5470635
#
# We can exponentiate the confidence interval to get the 
# limits on the OR:
exp(confint(hyper.fit))
## Waiting for profiling to be done...
##                 2.5 %    97.5 %
## (Intercept) 0.5591328 0.9844228
## hsgroup     0.3619429 0.8229722
# This yields:
#                   2.5 %    97.5 %
#   (Intercept) 0.5591328 0.9844228
#   hsgroup     0.3619429 0.8229722
#
# Thus, we have the unadjusted OR from the 
# paper:
# OR = 0.55 with 95% CI = [0.36, 0.82]
#
# What about predictions?
predict(hyper.fit,data.frame(hsgroup=0),type="response")
##         1 
## 0.4263959
predict(hyper.fit,data.frame(hsgroup=1),type="response")
##         1 
## 0.2890995
# Estimated Probability Plot
#fancier plot with ggplot2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
ggplot(hyper, aes(x=hsgroup, y=admitted)) + geom_point() +
  ggtitle("Plot of Admission Probability by Treatment Group") +
  scale_y_continuous(name="Probability of Admitted",breaks = seq(0,1,.1)) +
  scale_x_continuous(name="Treatment Group -- 0=HS -- 1=NS",breaks=seq(0,1,1)) +
  stat_smooth(method="glm", method.args=list(family="binomial"), se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# NOW LETS LOOK AT LOS OUTCOME and ANALYSIS
# looking at LOS
# From Abstract
# Mean (SD) LOS for NS 3.92 (5.24) [n=84]
# Mean (SD) LOS for HS 3.16 (2.11) [n=61]
# Note that 4 participants were missing LOS data
# so sample size is slightly off
# We can calculate t-test using summary statistics
# via a customized function in R
# m1, m2: the sample means
# s1, s2: the sample standard deviations
# n1, n2: the same sizes
# m0: the null value for the difference in means to be tested for. Default is 0. 
# equal.variance: whether or not to assume equal variance. Default is FALSE. 
t.test2 <- function(m1,m2,s1,s2,n1,n2,m0=0,equal.variance=FALSE)
{
  if( equal.variance==FALSE ) 
  {
    se <- sqrt( (s1^2/n1) + (s2^2/n2) )
    # welch-satterthwaite df
    df <- ( (s1^2/n1 + s2^2/n2)^2 )/( (s1^2/n1)^2/(n1-1) + (s2^2/n2)^2/(n2-1) )
  } else
  {
    # pooled standard deviation, scaled by the sample sizes
    se <- sqrt( (1/n1 + 1/n2) * ((n1-1)*s1^2 + (n2-1)*s2^2)/(n1+n2-2) ) 
    df <- n1+n2-2
  }      
  t <- (m1-m2-m0)/se 
  dat <- c(m1-m2, se, t, 2*pt(-abs(t),df))    
  names(dat) <- c("Difference of means", "Std Error", "t", "p-value")
  return(dat) 
}
# substituting in our values
ttest.los <- t.test2(3.92,3.16,5.24,2.11,84,61,equal.variance=TRUE)
ttest.los
## Difference of means           Std Error                   t             p-value 
##           0.7600000           0.7098224           1.0706903           0.2861125
# yields
#    Difference of means           Std Error 
#            0.7600000           0.7098224 
#                    t             p-value 
#            1.0706903           0.2861125 
# can also do this using web-based calculator like
# https://www.graphpad.com/quickcalcs/ttest1/
# End of Program