BIO 206 SAS codes translated to R

Format RMarkdown, read data from Excel file.

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE, 
    tidy = FALSE, fig.width = 8, fig.height = 7)
options(width = 116, scipen = 10)

setwd("~/statistics/bio206/")

Data check

## Read xls file
library(XLConnect)
wb             <- loadWorkbook("~/statistics/bio208/survses.xls")
survses        <- readWorksheet(wb, sheet = 1)
names(survses) <- tolower(names(survses))


## Check variables
str(survses)
'data.frame':   2094 obs. of  18 variables:
 $ sex    : num  1 1 2 2 1 1 2 2 1 1 ...
 $ race   : num  1 1 1 1 1 1 1 1 1 1 ...
 $ ips    : num  1 0 1 1 0 0 0 0 1 1 ...
 $ educat : num  NA NA NA NA NA NA NA NA NA NA ...
 $ income : num  NA NA NA NA NA NA NA NA NA NA ...
 $ study  : num  7751 7751 7751 7751 7751 ...
 $ age    : num  30 37 NA 32 20 34 16 46 30 32 ...
 $ marital: num  NA NA NA NA NA NA NA NA NA NA ...
 $ tension: num  NA NA NA NA NA NA NA NA NA NA ...
 $ depress: num  NA NA NA NA NA NA NA NA NA NA ...
 $ anger  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ vigor  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ fatigue: num  NA NA NA NA NA NA NA NA NA NA ...
 $ confuse: num  NA NA NA NA NA NA NA NA NA NA ...
 $ total  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ time   : num  2469 1271 2256 2286 2245 ...
 $ dead   : num  0 1 0 0 0 0 0 0 1 0 ...
 $ sick   : num  0 0 0 0 0 0 0 0 0 0 ...

## proc freq can be simulated with library(gmodels)'s CrossTable()
library(gmodels)
CrossTable(x = survses$sex)


   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|


Total Observations in Table:  2094 


          |         1 |         2 | 
          |-----------|-----------|
          |      1324 |       770 | 
          |     0.632 |     0.368 | 
          |-----------|-----------|





## Working on multiple variables is rather awkward
junk <- with(survses, {
    lapply(list(sex, race, ips),
           CrossTable
           )
})


   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|


Total Observations in Table:  2094 


          |         1 |         2 | 
          |-----------|-----------|
          |      1324 |       770 | 
          |     0.632 |     0.368 | 
          |-----------|-----------|






   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|


Total Observations in Table:  2093 


          |         1 |         2 |         3 | 
          |-----------|-----------|-----------|
          |      1810 |       247 |        36 | 
          |     0.865 |     0.118 |     0.017 | 
          |-----------|-----------|-----------|






   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|


Total Observations in Table:  2044 


          |         0 |         1 |         2 |         3 |         4 | 
          |-----------|-----------|-----------|-----------|-----------|
          |       547 |       813 |       407 |       208 |        69 | 
          |     0.268 |     0.398 |     0.199 |     0.102 |     0.034 | 
          |-----------|-----------|-----------|-----------|-----------|





## Check normality
mean(survses$age, na.rm = TRUE)
[1] 59.23
sd(survses$age, na.rm = TRUE)
[1] 10.63
median(survses$age, na.rm = TRUE)
[1] 60

## quantiles
quantile(survses$age, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
  0%  25%  50%  75% 100% 
  15   54   60   67   91 

## For skewness and kurtosis use library(e1071)
library(e1071)
skewness(survses$age, na.rm = TRUE, type = 2)
[1] -0.9579
kurtosis(survses$age, na.rm = TRUE, type = 2)
[1] 1.841

## If you need to do it at once, library(psych)'s describe() is an option
library(psych)
psych::describe(survses, type = 2)
        var    n    mean     sd median trimmed    mad  min  max range  skew kurtosis    se
sex       1 2094    1.37   0.48      1    1.33   0.00    1    2     1  0.55    -1.70  0.01
race      2 2093    1.15   0.40      1    1.04   0.00    1    3     2  2.68     6.80  0.01
ips       3 2044    1.24   1.06      1    1.13   1.48    0    4     4  0.71    -0.10  0.02
educat    4 1043    1.91   0.79      2    1.84   1.48    1    4     3  0.67     0.16  0.02
income    5  931    2.14   0.92      2    2.05   1.48    1    4     3  0.46    -0.61  0.03
study     6 2094 7872.26 130.82   7782 7860.23  31.13 7751 8083   332  0.60    -1.41  2.86
age       7 2093   59.23  10.63     60   60.05  10.38   15   91    76 -0.96     1.84  0.23
marital   8 1068    2.44   1.10      2    2.26   0.00    1    5     4  1.47     0.96  0.03
tension   9  944   13.00   7.48     11   12.34   7.41    1   36    35  0.74    -0.15  0.24
depress  10  846   11.97   9.98      9   10.50   7.41    1   56    55  1.32     1.56  0.34
anger    11  739    7.73   6.83      6    6.58   4.45    1   45    44  1.73     3.66  0.25
vigor    12  937   13.70   6.50     14   13.54   7.41    1   32    31  0.22    -0.50  0.21
fatigue  13  880   10.87   6.96      9   10.30   7.41    1   30    29  0.64    -0.49  0.23
confuse  14  918    7.43   4.89      6    6.87   4.45    1   28    27  1.04     0.98  0.16
total    15  921   37.06  36.97     27   32.04  26.69    1  612   611  4.85    63.36  1.22
time     16 2094  498.61 490.86    325  414.18 306.90    0 2570  2570  1.58     2.17 10.73
dead     17 2094    0.80   0.40      1    0.88   0.00    0    1     1 -1.52     0.32  0.01
sick     18 2044    0.33   0.47      0    0.29   0.00    0    1     1  0.70    -1.51  0.01

## Hmisc's describe() gives quantiles
library(Hmisc)
Hmisc::describe(survses[,c("age","depress","confuse")], type = 2)
survses[, c("age", "depress", "confuse")] 

 3  Variables      2094  Observations
--------------------------------------------------------------------------------------------------------------------
age 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
   2093       1      70   59.23      41      46      54      60      67      71      74 

lowest : 15 16 17 18 19, highest: 82 83 85 86 91 
--------------------------------------------------------------------------------------------------------------------
depress 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
    846    1248      49   11.97       1       2       4       9      17      27      32 

lowest :  1  2  3  4  5, highest: 45 47 53 55 56 
--------------------------------------------------------------------------------------------------------------------
confuse 
      n missing  unique    Mean     .05     .10     .25     .50     .75     .90     .95 
    918    1176      26   7.432       1       2       4       6      10      14      17 

lowest :  1  2  3  4  5, highest: 22 24 25 26 28 
--------------------------------------------------------------------------------------------------------------------

Graphics

## Traditional scatter plot
plot(data = survses, tension ~ depress)

plot of chunk unnamed-chunk-3


## ggplot2 is another graphical package
library(ggplot2)
## quick plot
qplot(data = survses, y = tension, x= depress)

plot of chunk unnamed-chunk-3

## same thing with more explicit grammer
## ggplot(data = survses) + geom_point(aes(y = tension, x = depress))

## Histogram for age
hist(survses$age)

plot of chunk unnamed-chunk-3


## Density plot for age with ggplot2
qplot(data = survses, x = age, geom = "density")

plot of chunk unnamed-chunk-3


## Pie chart. Create table first, then plot with pie()
tab.race <- table(survses$race)
pie(tab.race)

plot of chunk unnamed-chunk-3


## Boxplot
boxplot(survses$vigor)

plot of chunk unnamed-chunk-3


## Boxplot with ggplot2
ggplot(survses) + geom_boxplot(aes(y = vigor, x = "vigor"))

plot of chunk unnamed-chunk-3


## qunatile vs quantile plot for normality
library(car)
qqPlot(survses$vigor)

plot of chunk unnamed-chunk-3



## Bar chart side by side
ggplot(survses) +
    geom_bar(aes(x = factor(sex), fill = factor(race)), position = "dodge")

plot of chunk unnamed-chunk-3


## Bar chart stacked
ggplot(survses) +
    geom_bar(aes(x = factor(sex), fill = factor(race)), position = "stack") +
    guides(fill = guide_legend(reverse = TRUE))

plot of chunk unnamed-chunk-3


## Bar chart stacked (proportion)
ggplot(survses) +
    geom_bar(aes(x = factor(sex), fill = factor(race)), position = "fill") +
    ylab("proportion") + guides(fill = guide_legend(reverse = TRUE))

plot of chunk unnamed-chunk-3

t-test, Wilcoxon rank sum test, and ANOVA

## Recoding 0,1 to 0; 2,3,4 to 1, then name the new variable sick
library(car)
survses <- within(survses, {
    sick <- recode(ips, recodes = "c(0,1) = 0; c(2,3,4) = 1")
})

## t-test: one at a time
t.test(data = survses, tension ~ sick)

    Welch Two Sample t-test

data:  tension by sick 
t = -3.363, df = 512.3, p-value = 0.0008291
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -2.9181 -0.7659 
sample estimates:
mean in group 0 mean in group 1 
          12.45           14.29 

t.test(data = survses, total ~ sick)

    Welch Two Sample t-test

data:  total by sick 
t = -4.726, df = 599.6, p-value = 0.000002859
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -16.927  -6.988 
sample estimates:
mean in group 0 mean in group 1 
          33.48           45.44 



## Probably no simple way to perform multiple tests
## multi.tests() function defined here
multi.tests <- function(fun = t.test, df, vars, group.var, ...) {
    sapply(simplify = FALSE,
           vars,
           function(var) {
               formula <- as.formula(paste(var, "~", group.var))
               fun(data = df, formula, ...)
           }
           )
}

## Multiple t-tests
res.multi.t.tests <-
    multi.tests(fun = t.test,
                df = survses,
                vars = c("tension","depress","anger","vigor","fatigue","confuse","total"),
                group.var = "sick",
                var.equal = TRUE)
res.multi.t.tests
$tension

    Two Sample t-test

data:  tension by sick 
t = -3.511, df = 927, p-value = 0.0004678
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -2.8716 -0.8124 
sample estimates:
mean in group 0 mean in group 1 
          12.45           14.29 


$depress

    Two Sample t-test

data:  depress by sick 
t = -3.334, df = 830, p-value = 0.0008927
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -3.881 -1.005 
sample estimates:
mean in group 0 mean in group 1 
          11.22           13.66 


$anger

    Two Sample t-test

data:  anger by sick 
t = -0.9467, df = 728, p-value = 0.3441
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -1.5820  0.5526 
sample estimates:
mean in group 0 mean in group 1 
          7.596           8.111 


$vigor

    Two Sample t-test

data:  vigor by sick 
t = 7.813, df = 921, p-value = 0.00000000000001522
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 2.617 4.373 
sample estimates:
mean in group 0 mean in group 1 
          14.79           11.30 


$fatigue

    Two Sample t-test

data:  fatigue by sick 
t = -6.324, df = 864, p-value = 0.0000000004085
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -4.080 -2.147 
sample estimates:
mean in group 0 mean in group 1 
           9.89           13.00 


$confuse

    Two Sample t-test

data:  confuse by sick 
t = -4.441, df = 901, p-value = 0.00001004
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -2.2179 -0.8585 
sample estimates:
mean in group 0 mean in group 1 
          6.946           8.484 


$total

    Two Sample t-test

data:  total by sick 
t = -4.546, df = 905, p-value = 0.000006195
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -17.119  -6.796 
sample estimates:
mean in group 0 mean in group 1 
          33.48           45.44 


## p-values can be extracted from the result object
data.frame(p.value = sapply(res.multi.t.tests, getElement, name = "p.value"))
                    p.value
tension 0.00046779024854653
depress 0.00089265010722945
anger   0.34410112363187073
vigor   0.00000000000001522
fatigue 0.00000000040851219
confuse 0.00001004401067984
total   0.00000619451833615

## Multiple Wilcoxon's rank sum tests
res.multi.wilcox.tests <-
    multi.tests(fun = wilcox.test,
                df = survses,
                vars = c("tension","depress"),
                group.var = "sick")
res.multi.wilcox.tests
$tension

    Wilcoxon rank sum test with continuity correction

data:  tension by sick 
W = 81298, p-value = 0.001758
alternative hypothesis: true location shift is not equal to 0 


$depress

    Wilcoxon rank sum test with continuity correction

data:  depress by sick 
W = 64933, p-value = 0.0003463
alternative hypothesis: true location shift is not equal to 0 


## ANOVA: Make sure the grouping variable is a factor (categorical variable in R)
anova(lm(data = survses, tension ~ factor(sick)))
Analysis of Variance Table

Response: tension
              Df Sum Sq Mean Sq F value  Pr(>F)    
factor(sick)   1    681     681    12.3 0.00047 ***
Residuals    927  51178      55                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Binomial test

## Create dataset with only dead subjects
survses.dead <- subset(survses, dead == 1 & !is.na(dead))

## Recoding
library(recode)
Error: there is no package called 'recode'
survses.dead <- within(survses.dead, {
    mort1yr <- (time <= 365)
    hiedu <- (educat > 2)
    hiinc <- (income > 2)
    sick <- recode(ips, recodes = "c(0,1) = 0; c(2,3,4) = 1")
})

## Counts
addmargins(table(survses.dead$mort1yr))

FALSE  TRUE   Sum 
  643  1038  1681 

## Exact binomial test: H0 = 1yr survival is equal to 0.25
binom.test(x = 643, n = 1681, p = 0.25)

    Exact binomial test

data:  643 and 1681 
number of successes = 643, number of trials = 1681, p-value < 2.2e-16
alternative hypothesis: true probability of success is not equal to 0.25 
95 percent confidence interval:
 0.3592 0.4062 
sample estimates:
probability of success 
                0.3825 

Various table construction

## Cross table
tab.hiedu.mort1yr <- xtabs(data = survses.dead, ~ hiedu + mort1yr)[,2:1]
tab.hiedu.mort1yr
       mort1yr
hiedu   TRUE FALSE
  FALSE  418   297
  TRUE    86    72

## Include NA value
xtabs(data = survses.dead, ~ hiedu + mort1yr, na.action = na.pass, exclude = NULL)
       mort1yr
hiedu   FALSE TRUE
  FALSE   297  418
  TRUE     72   86
  <NA>    274  534

## SAS-like table
library(gmodels)
CrossTable(tab.hiedu.mort1yr)


   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|


Total Observations in Table:  873 


             | mort1yr 
       hiedu |      TRUE |     FALSE | Row Total | 
-------------|-----------|-----------|-----------|
       FALSE |       418 |       297 |       715 | 
             |     0.066 |     0.090 |           | 
             |     0.585 |     0.415 |     0.819 | 
             |     0.829 |     0.805 |           | 
             |     0.479 |     0.340 |           | 
-------------|-----------|-----------|-----------|
        TRUE |        86 |        72 |       158 | 
             |     0.298 |     0.407 |           | 
             |     0.544 |     0.456 |     0.181 | 
             |     0.171 |     0.195 |           | 
             |     0.099 |     0.082 |           | 
-------------|-----------|-----------|-----------|
Column Total |       504 |       369 |       873 | 
             |     0.577 |     0.423 |           | 
-------------|-----------|-----------|-----------|



## Another table including NA
xtabs(data = survses.dead, ~ hiedu +educat, exclude = NULL, na.action = na.pass)
       educat
hiedu     1   2   3   4 <NA>
  FALSE 291 424   0   0    0
  TRUE    0   0 122  36    0
  <NA>    0   0   0   0  808

## Cross table
xtabs(data = survses.dead, ~ hiinc + mort1yr)[,2:1]
       mort1yr
hiinc   TRUE FALSE
  FALSE  318   221
  TRUE   127   116

Table based tests

## sick * mort1yr table
tab.sick.mort1yr <- xtabs(data = survses.dead, ~ sick +mort1yr)[2:1,2:1]
tab.sick.mort1yr
    mort1yr
sick TRUE FALSE
   1  387   202
   0  632   435

## Chi-squared test
chisq.test(tab.sick.mort1yr, correct = FALSE)

    Pearson's Chi-squared test

data:  tab.sick.mort1yr 
X-squared = 6.718, df = 1, p-value = 0.009544


## Fisher's exact test
fisher.test(tab.sick.mort1yr)

    Fisher's Exact Test for Count Data

data:  tab.sick.mort1yr 
p-value = 0.009767
alternative hypothesis: true odds ratio is not equal to 1 
95 percent confidence interval:
 1.064 1.636 
sample estimates:
odds ratio 
     1.318 


##library(epiR)'s epi.2by2 function is useful for calculating risk (Inc risk),
## relative risk (Inc risk ratio), and odss ratio
library(epiR)
epi.2by2(tab.sick.mort1yr, units = 1)
             Disease +    Disease -      Total        Inc risk *        Odds
Exposed +          387          202        589             0.657        1.92
Exposed -          632          435       1067             0.592        1.45
Total             1019          637       1656             0.615        1.60

Point estimates and 95 % CIs:
---------------------------------------------------------
Inc risk ratio                         1.11 (1.03, 1.2)
Odds ratio                             1.32 (1.07, 1.63)
Attrib risk *                          0.06 (0.02, 0.11)
Attrib risk in population *            0.02 (-0.01, 0.06)
Attrib fraction in exposed (%)         9.85 (2.67, 16.51)
Attrib fraction in population (%)      3.74 (0.89, 6.51)
---------------------------------------------------------
 * Cases per population unit 

Stratified analysis

## sick * mort1yr stratified by race
tab.sick.mort1yr.race <- xtabs(data = survses.dead, ~ sick +mort1yr +race)[2:1,2:1,]
tab.sick.mort1yr.race
, , race = 1

    mort1yr
sick TRUE FALSE
   1  336   169
   0  553   395

, , race = 2

    mort1yr
sick TRUE FALSE
   1   43    31
   0   67    35

, , race = 3

    mort1yr
sick TRUE FALSE
   1    8     2
   0   11     5


## epi.2by2 can do stratified analysis too. Not a big difference here.
epi.2by2(tab.sick.mort1yr.race, units = 1)
             Disease +    Disease -      Total        Inc risk *        Odds
Exposed +          387          202        589             0.657        1.92
Exposed -          631          435       1066             0.592        1.45
Total             1018          637       1655             0.615        1.60


Point estimates and 95 % CIs:
---------------------------------------------------------
Inc risk ratio (crude)                    1.11 (1.03, 1.2)
Inc risk ratio (M-H)                      1.11 (1.03, 1.2)
Inc risk ratio (crude:M-H)                1
Odds ratio (crude)                        1.32 (1.07, 1.63)
Odds ratio (M-H)                          1.32 (1.05, 1.65)
Odds ratio (crude:M-H)                    1
Attrib risk (crude) *                     0.07 (0.02, 0.11)
Attrib risk (M-H) *                       0.06 (0, 0.13)
Attrib risk (crude:M-H)                   1.01
---------------------------------------------------------
 * Cases per population unit 

## Mantel-Haenszel test. H0 = common OR is 1
mantelhaen.test(tab.sick.mort1yr.race)

    Mantel-Haenszel chi-squared test with continuity correction

data:  tab.sick.mort1yr.race 
Mantel-Haenszel X-squared = 6.404, df = 1, p-value = 0.01138
alternative hypothesis: true common odds ratio is not equal to 1 
95 percent confidence interval:
 1.069 1.625 
sample estimates:
common odds ratio 
            1.318 


## Breslow-Day test for homogeneity of OR
epi.2by2(tab.sick.mort1yr.race, units = 1, verbose = TRUE)$OR.homog
  test.statistic df p.value
1          4.183  2  0.1235