Biostatistics 213: Homework 7

Loading data

library(gdata)
icu <- read.xls("~/statistics/bio213/icu.xls")

## Correct variable names: po2 for pO2, pco2 for pCO2
names(icu) <- c("id","status","age","sex","race","service","cancer","renal","inf","cpr","sys","heart","prevad","type","frac","po2","ph","pco2","bic","cre","loc")

## Same data available online:
## Hosmer and Lemeshow (2000) Applied Logistic Regression
## http://www.umass.edu/statdata/statdata/data/index.html
## description: http://www.umass.edu/statdata/statdata/data/icu.txt
## text format: http://www.umass.edu/statdata/statdata/data/icu.dat
## Excel format: http://www.umass.edu/statdata/statdata/data/icu.xls

## icu <- read.table("http://www.umass.edu/statdata/statdata/data/icu.dat",
##                   skip = 8, col.names = c("id","status","age","sex","race","service","cancer","renal","inf","cpr","sys","heart","prevad","type","frac","po2","ph","pco2","bic","cre","loc")
##                   )
## OR
## icu <- read.xls("http://www.umass.edu/statdata/statdata/data/icu.xls")

Exploring data

## Number of subjects
nrow(icu)

## Check categorical and continuous variables (No need after categorical conversion)
lapply(icu, function(x) head(summary.factor(x), 20))  # First 20 levels

## Check number of unique values
n.unique.values <- sapply(icu, function(x) nlevels(factor(x)))
binary.variables <- names(icu)[n.unique.values == 2]
dput(binary.variables)

## summary
summary(icu)

## Number of complete cases: All 200 are complete cases
sum(complete.cases(icu))

## Frequencies
library(Deducer)
frequencies(icu[, !names(icu) %in% c("id","age","sys","heart")])

Recoding data

## Recode variables
icu <- within(icu, {
    status  <- factor(status,  levels = 0:1, labels = c("Lived","Died"))
    sex     <- factor(sex,     levels = 0:1, labels = c("Male","Female"))
    race    <- factor(race,    levels = 1:3, labels = c("White","Black","Other"))
    service <- factor(service, levels = 0:1, labels = c("Medical","Surgical"))
    type    <- factor(type,    levels = 0:1, labels = c("Elective","Emergency"))
    po2     <- factor(po2,     levels = 0:1, labels = c(">60","<=60"))
    ph      <- factor(ph,      levels = 0:1, labels = c(">=7.25","<7.25"))
    pco2    <- factor(pco2,    levels = 0:1, labels = c("<=45",">45"))
    bic     <- factor(bic,     levels = 0:1, labels = c(">=18","<18"))
    cre     <- factor(cre,     levels = 0:1, labels = c("<=2.0",">2.0"))
    loc     <- factor(loc,     levels = 0:2, labels = c("No Coma or Deep Stupor","Deep Stupor","Coma"))
})

## Recode No-Yes variables
no.yes.variables <- c("cancer","renal","inf","cpr","prevad","frac")
icu[,no.yes.variables] <-
    lapply(icu[,no.yes.variables],
           function(var) {
               var <- factor(var, levels = 0:1, labels = c("No","Yes"))
           })

summary(icu)
       id        status         age           sex         race         service    cancer    renal      inf     
 Min.   :  4   Lived:160   Min.   :16.0   Male  :124   White:175   Medical : 93   No :180   No :181   No :116  
 1st Qu.:210   Died : 40   1st Qu.:46.8   Female: 76   Black: 15   Surgical:107   Yes: 20   Yes: 19   Yes: 84  
 Median :412               Median :63.0                Other: 10                                               
 Mean   :445               Mean   :57.5                                                                        
 3rd Qu.:672               3rd Qu.:72.0                                                                        
 Max.   :929               Max.   :92.0                                                                        
  cpr           sys          heart       prevad           type      frac       po2           ph        pco2    
 No :187   Min.   : 36   Min.   : 39.0   No :170   Elective : 53   No :185   >60 :184   >=7.25:187   <=45:180  
 Yes: 13   1st Qu.:110   1st Qu.: 80.0   Yes: 30   Emergency:147   Yes: 15   <=60: 16   <7.25 : 13   >45 : 20  
           Median :130   Median : 96.0                                                                         
           Mean   :132   Mean   : 98.9                                                                         
           3rd Qu.:150   3rd Qu.:118.2                                                                         
           Max.   :256   Max.   :192.0                                                                         
   bic         cre                          loc     
 >=18:185   <=2.0:190   No Coma or Deep Stupor:185  
 <18 : 15   >2.0 : 10   Deep Stupor           :  5  
                        Coma                  : 10  



Overall summary

No missing data are present. There are few minority patients, cancer patients, chronic renal failu patients, patients who underwent CPR, long bone fracture patients, patients who had abnormal values in the ABG testing, and patients who had abnormal conciousness levels. It may be necessary to combine or omit some of these variables.

Pairs relationship

library(gpairs)
gpairs(icu)

plot of chunk unnamed-chunk-5

Continuous: age, sys, heart

Continuous variables were compared between the Lived and Dead groups. Mean (sd) and median [IQR] are shown for normal and non-normal variables, respectively.

Kurtosis was higher than 1 for age and systolic blood pressure in the Died group, thus, these variables were consided non-normal. heart variable was considered normal. Age was significantly higher for the Died group (Lived 61 [41.8, 71.0] vs Died 68 [56.5, 75.0], Wicoxon test, p = 0.011), systolic blood pressure was significantly higher for the Lived group (Lived 132 [112.0,154.0] vs Died 126 [89.0,140.0], Wicoxon test, p = 0.011). There was no significant difference between the groups for the heart rate variable (Lived 135.6 (29.8) vs Dead 118.8 (41.1), t-test, p = 0.655).

library(reshape2)
icu.cont.melt <- melt(icu[,c("status","age","sys","heart")])
library(ggplot2)
ggplot(icu.cont.melt, aes(x = status, y = value, color = variable)) +
    geom_boxplot() +
    facet_grid(variable ~ ., scales = "free")

plot of chunk unnamed-chunk-6

Check for normality

library(plyr)
normality.by.status <- dlply(icu,
                             "status",
                             function(df) {
                                 ..SeeNormality(df[,c("age","sys","heart")])
                             })
lapply(normality.by.status, invisible)
$Lived
      Number Not.NA   Mean     SD Skewness Kurtosis Median Quartile_25 Quartile_75          p.SW
age      160    160  55.65 20.428 -0.55895 -0.77710     61       41.75        71.0 0.00000026618
sys      160    160 135.64 29.802  0.42766  0.43244    132      112.00       154.0 0.01420516221
heart    160    160  98.50 26.979  0.44650  0.25011     95       80.00       116.5 0.07580061579

$Died
      Number Not.NA    Mean     SD Skewness Kurtosis Median Quartile_25 Quartile_75     p.SW
age       40     40  65.125 16.649 -0.91188  1.21558     68        56.5       75.00 0.036463
sys       40     40 118.825 41.081  0.64330  2.02097    126        89.0      140.00 0.058034
heart     40     40 100.625 26.493  0.30301 -0.54784     96        81.0      119.75 0.591875


## F-test
two.sample.test(d(heart) ~ d(status), data = icu, test = var.test)
                                          F test to compare two variances                                            
      ratio of variances 95% CI Lower 95% CI Upper     F (num df,denom df) p-value
heart              1.037      0.60472       1.6473 1.037          (159,39)   0.927
  HA: two.sided 
  H0:  ratio of variances = 1 
## t-test (equal variance)
two.sample.test(d(heart) ~ d(status), data = icu, test = t.test, var.equal = TRUE)
                                                  Two Sample t-test                                                  
      mean of Lived mean of Died Difference 95% CI Lower 95% CI Upper        t  df p-value
heart          98.5       100.62     -2.125      -11.497       7.2468 -0.44714 198 0.65526
  HA: two.sided 
  H0:  difference in means = 0 
## Wilcoxon test
two.sample.test(d(age,sys) ~ d(status), data = icu, test = wilcox.test, correct = FALSE)
                                               Wilcoxon rank sum test                                                
         W  p-value
age 2368.5 0.011077
sys 4036.5 0.010569
  HA: two.sided 
  H0:  location shift = 0 

Categorical: all others

Fisher exact test was conducted to test for significant departure from indendence.

For binary variables, the proportion of the “1” category is shown. Compared to the Died group, among the Lived group, there were significantly higher proportions of medical patients, non-chronic renal failure patients, non-infection patients, patients who did not undergo CPR, and elective admission patients. The distribution of conciousness status were different between the Lived and Died group. Among these variables, emergency use of ICU (p < 0.001) and undergoing CPR (p = 0.005) appear to be the most significant factors associated ICU mortality.

Proportions (%) of category 1 (0-1 coded variables) within each status stratum

binary.covariates <- c("sex","service","cancer","renal","inf","cpr","prevad","type","frac","po2","ph","pco2","bic","cre")
multicat.covariates <- c("race","loc")

tabs.bin.var <- ..multi.xtabs(df = icu,
                              vars = binary.covariates,
                              group.var = "status")

tab.bin.pval <- sapply(tabs.bin.var, function(var) fisher.test(var)$p.val)


## Binary variables
prop.tab.bin <- lapply(tabs.bin.var,
                       function(tab) {
                           prop.table(tab, margin = 2)[2,]
                       }
                       )

prop.tab.bin <- do.call(rbind, prop.tab.bin)
cbind(round(prop.tab.bin * 100, 1), "Fisher test p-value" = round(tab.bin.pval, 3))
        Lived Died Fisher test p-value
sex      37.5 40.0               0.856
service  58.1 35.0               0.013
cancer   10.0 10.0               1.000
renal     6.9 20.0               0.029
inf      37.5 60.0               0.012
cpr       3.8 17.5               0.005
prevad   14.4 17.5               0.624
type     68.1 95.0               0.000
frac      7.5  7.5               1.000
po2       6.9 12.5               0.324
ph        5.6 10.0               0.297
pco2     10.0 10.0               1.000
bic       6.2 12.5               0.187
cre       3.1 12.5               0.029

Proportions (%) of all categories within each status stratum

## Multi-category variables
tabs.multi.var <- ..multi.xtabs(df = icu,
                              vars = multicat.covariates,
                              group.var = "status")

lapply(tabs.multi.var, function(tab) {
    res.tab <- round(prop.table(tab, margin = 2) * 100, 1)
    res.fisher <- fisher.test(tab)$p.val
    list("Proportion table" = res.tab, "Fisher test p-value" = res.fisher)
})
$race
$race$`Proportion table`
       status
race    Lived Died
  White  86.2 92.5
  Black   8.8  2.5
  Other   5.0  5.0

$race$`Fisher test p-value`
[1] 0.49951


$loc
$loc$`Proportion table`
                        status
loc                      Lived Died
  No Coma or Deep Stupor  98.8 67.5
  Deep Stupor              0.0 12.5
  Coma                     1.2 20.0

$loc$`Fisher test p-value`
[1] 0.000000005737

Univariable logistic regression models at once

lapply(c("age","sex","race","service","cancer","renal","inf","cpr","sys","heart","prevad","type","frac","po2","ph","pco2","bic","cre","loc"),
       function(var) {
           formula <- as.formula(paste("status ~", var))
           res.logist<- glm(formula, data = icu, family = binomial)
           ..glm.ratio(res.logist)
       })
[[1]]
              OR 2.5 % 97.5 %
(Intercept) 0.05  0.01   0.17
age         1.03  1.01   1.05

[[2]]
              OR 2.5 % 97.5 %
(Intercept) 0.24  0.15   0.37
sexFemale   1.11  0.54   2.24

[[3]]
              OR 2.5 % 97.5 %
(Intercept) 0.27  0.18   0.38
raceBlack   0.27  0.01   1.39
raceOther   0.93  0.14   3.92

[[4]]
                  OR 2.5 % 97.5 %
(Intercept)     0.39  0.24   0.60
serviceSurgical 0.39  0.18   0.79

[[5]]
              OR 2.5 % 97.5 %
(Intercept) 0.25  0.17   0.36
cancerYes   1.00  0.27   2.93

[[6]]
              OR 2.5 % 97.5 %
(Intercept) 0.21  0.14   0.31
renalYes    3.39  1.22   9.06

[[7]]
              OR 2.5 % 97.5 %
(Intercept) 0.16  0.09   0.26
infYes      2.50  1.24   5.16

[[8]]
              OR 2.5 % 97.5 %
(Intercept) 0.21  0.14   0.31
cprYes      5.44  1.70  17.94

[[9]]
              OR 2.5 % 97.5 %
(Intercept) 2.18  0.50   9.92
sys         0.98  0.97   0.99

[[10]]
              OR 2.5 % 97.5 %
(Intercept) 0.19  0.05   0.70
heart       1.00  0.99   1.02

[[11]]
              OR 2.5 % 97.5 %
(Intercept) 0.24  0.16   0.35
prevadYes   1.26  0.47   3.07

[[12]]
                OR 2.5 % 97.5 %
(Intercept)   0.04  0.01   0.13
typeEmergency 8.89  2.58  55.96

[[13]]
              OR 2.5 % 97.5 %
(Intercept) 0.25  0.17   0.35
fracYes     1.00  0.22   3.34

[[14]]
              OR 2.5 % 97.5 %
(Intercept) 0.23  0.16   0.34
po2<=60     1.94  0.58   5.70

[[15]]
              OR 2.5 % 97.5 %
(Intercept) 0.24  0.16   0.34
ph<7.25     1.86  0.48   6.08

[[16]]
              OR 2.5 % 97.5 %
(Intercept) 0.25  0.17   0.36
pco2>45     1.00  0.27   2.93

[[17]]
              OR 2.5 % 97.5 %
(Intercept) 0.23  0.16   0.33
bic<18      2.14  0.63   6.44

[[18]]
              OR 2.5 % 97.5 %
(Intercept) 0.23  0.15   0.32
cre>2.0     4.43  1.17  16.74

[[19]]
                        OR 2.5 % 97.5 %
(Intercept)           0.17  0.11   0.25
locDeep Stupor 91589444.64  0.00     NA
locComa              23.41  5.52 160.80

Similar way to do it in SAS

/*Looping over variables to perform multiple univariate logistic regression*/
/*http://www.sas.com/offices/europe/uk/support/sas-hints-tips/ht1_jan04.html*/

%macro logistrepeat(values);
    %local i j ;                         %* i for variable name, j for position;
    %let j = 1 ;
    %do %while(%scan(&values, &j) ne ) ; %* <== repeat while jth value is NotEqual empty ;
        %let i = %scan(&values, &j) ;    %* <== Insert jth element to variable i;

        /* Logistic regression procedure below */
          Title "Model with -- &i -- as predictor";

          proc logist descending data= icu;
          model status = &i;     %* <== Variable name inserted here;
          run;

         %let j = %eval(&j+1) ;          %* <== Increase the value of j by 1;
    %end ;
%mend ;

/* How to run it*/
%logistrepeat(age sex race service cancer renal inf cpr sys heart prevad type frac p02 ph pc02 bic crit loc);