Format RMarkdown, read data from Excel file.
## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE,
tidy = FALSE, fig.width = 8, fig.height = 7)
options(width = 116, scipen = 10)
setwd("~/statistics/bio206/")
## Read xls file
library(XLConnect)
wb <- loadWorkbook("~/statistics/bio208/survses.xls")
survses <- readWorksheet(wb, sheet = 1)
names(survses) <- tolower(names(survses))
## Check variables
str(survses)
'data.frame': 2094 obs. of 18 variables:
$ sex : num 1 1 2 2 1 1 2 2 1 1 ...
$ race : num 1 1 1 1 1 1 1 1 1 1 ...
$ ips : num 1 0 1 1 0 0 0 0 1 1 ...
$ educat : num NA NA NA NA NA NA NA NA NA NA ...
$ income : num NA NA NA NA NA NA NA NA NA NA ...
$ study : num 7751 7751 7751 7751 7751 ...
$ age : num 30 37 NA 32 20 34 16 46 30 32 ...
$ marital: num NA NA NA NA NA NA NA NA NA NA ...
$ tension: num NA NA NA NA NA NA NA NA NA NA ...
$ depress: num NA NA NA NA NA NA NA NA NA NA ...
$ anger : num NA NA NA NA NA NA NA NA NA NA ...
$ vigor : num NA NA NA NA NA NA NA NA NA NA ...
$ fatigue: num NA NA NA NA NA NA NA NA NA NA ...
$ confuse: num NA NA NA NA NA NA NA NA NA NA ...
$ total : num NA NA NA NA NA NA NA NA NA NA ...
$ time : num 2469 1271 2256 2286 2245 ...
$ dead : num 0 1 0 0 0 0 0 0 1 0 ...
$ sick : num 0 0 0 0 0 0 0 0 0 0 ...
## proc freq can be simulated with library(gmodels)'s CrossTable()
library(gmodels)
CrossTable(x = survses$sex)
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 2094
| 1 | 2 |
|-----------|-----------|
| 1324 | 770 |
| 0.632 | 0.368 |
|-----------|-----------|
## Working on multiple variables is rather awkward
junk <- with(survses, {
lapply(list(sex, race, ips),
CrossTable
)
})
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 2094
| 1 | 2 |
|-----------|-----------|
| 1324 | 770 |
| 0.632 | 0.368 |
|-----------|-----------|
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 2093
| 1 | 2 | 3 |
|-----------|-----------|-----------|
| 1810 | 247 | 36 |
| 0.865 | 0.118 | 0.017 |
|-----------|-----------|-----------|
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 2044
| 0 | 1 | 2 | 3 | 4 |
|-----------|-----------|-----------|-----------|-----------|
| 547 | 813 | 407 | 208 | 69 |
| 0.268 | 0.398 | 0.199 | 0.102 | 0.034 |
|-----------|-----------|-----------|-----------|-----------|
## Check normality
mean(survses$age, na.rm = TRUE)
[1] 59.23
sd(survses$age, na.rm = TRUE)
[1] 10.63
median(survses$age, na.rm = TRUE)
[1] 60
## quantiles
quantile(survses$age, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
0% 25% 50% 75% 100%
15 54 60 67 91
## For skewness and kurtosis use library(e1071)
library(e1071)
skewness(survses$age, na.rm = TRUE, type = 2)
[1] -0.9579
kurtosis(survses$age, na.rm = TRUE, type = 2)
[1] 1.841
## If you need to do it at once, library(psych)'s describe() is an option
library(psych)
psych::describe(survses, type = 2)
var n mean sd median trimmed mad min max range skew kurtosis se
sex 1 2094 1.37 0.48 1 1.33 0.00 1 2 1 0.55 -1.70 0.01
race 2 2093 1.15 0.40 1 1.04 0.00 1 3 2 2.68 6.80 0.01
ips 3 2044 1.24 1.06 1 1.13 1.48 0 4 4 0.71 -0.10 0.02
educat 4 1043 1.91 0.79 2 1.84 1.48 1 4 3 0.67 0.16 0.02
income 5 931 2.14 0.92 2 2.05 1.48 1 4 3 0.46 -0.61 0.03
study 6 2094 7872.26 130.82 7782 7860.23 31.13 7751 8083 332 0.60 -1.41 2.86
age 7 2093 59.23 10.63 60 60.05 10.38 15 91 76 -0.96 1.84 0.23
marital 8 1068 2.44 1.10 2 2.26 0.00 1 5 4 1.47 0.96 0.03
tension 9 944 13.00 7.48 11 12.34 7.41 1 36 35 0.74 -0.15 0.24
depress 10 846 11.97 9.98 9 10.50 7.41 1 56 55 1.32 1.56 0.34
anger 11 739 7.73 6.83 6 6.58 4.45 1 45 44 1.73 3.66 0.25
vigor 12 937 13.70 6.50 14 13.54 7.41 1 32 31 0.22 -0.50 0.21
fatigue 13 880 10.87 6.96 9 10.30 7.41 1 30 29 0.64 -0.49 0.23
confuse 14 918 7.43 4.89 6 6.87 4.45 1 28 27 1.04 0.98 0.16
total 15 921 37.06 36.97 27 32.04 26.69 1 612 611 4.85 63.36 1.22
time 16 2094 498.61 490.86 325 414.18 306.90 0 2570 2570 1.58 2.17 10.73
dead 17 2094 0.80 0.40 1 0.88 0.00 0 1 1 -1.52 0.32 0.01
sick 18 2044 0.33 0.47 0 0.29 0.00 0 1 1 0.70 -1.51 0.01
## Hmisc's describe() gives quantiles
library(Hmisc)
Hmisc::describe(survses[,c("age","depress","confuse")], type = 2)
survses[, c("age", "depress", "confuse")]
3 Variables 2094 Observations
--------------------------------------------------------------------------------------------------------------------
age
n missing unique Mean .05 .10 .25 .50 .75 .90 .95
2093 1 70 59.23 41 46 54 60 67 71 74
lowest : 15 16 17 18 19, highest: 82 83 85 86 91
--------------------------------------------------------------------------------------------------------------------
depress
n missing unique Mean .05 .10 .25 .50 .75 .90 .95
846 1248 49 11.97 1 2 4 9 17 27 32
lowest : 1 2 3 4 5, highest: 45 47 53 55 56
--------------------------------------------------------------------------------------------------------------------
confuse
n missing unique Mean .05 .10 .25 .50 .75 .90 .95
918 1176 26 7.432 1 2 4 6 10 14 17
lowest : 1 2 3 4 5, highest: 22 24 25 26 28
--------------------------------------------------------------------------------------------------------------------
## Traditional scatter plot
plot(data = survses, tension ~ depress)
## ggplot2 is another graphical package
library(ggplot2)
## quick plot
qplot(data = survses, y = tension, x= depress)
## same thing with more explicit grammer
## ggplot(data = survses) + geom_point(aes(y = tension, x = depress))
## Histogram for age
hist(survses$age)
## Density plot for age with ggplot2
qplot(data = survses, x = age, geom = "density")
## Pie chart. Create table first, then plot with pie()
tab.race <- table(survses$race)
pie(tab.race)
## Boxplot
boxplot(survses$vigor)
## Boxplot with ggplot2
ggplot(survses) + geom_boxplot(aes(y = vigor, x = "vigor"))
## qunatile vs quantile plot for normality
library(car)
qqPlot(survses$vigor)
## Bar chart side by side
ggplot(survses) +
geom_bar(aes(x = factor(sex), fill = factor(race)), position = "dodge")
## Bar chart stacked
ggplot(survses) +
geom_bar(aes(x = factor(sex), fill = factor(race)), position = "stack") +
guides(fill = guide_legend(reverse = TRUE))
## Bar chart stacked (proportion)
ggplot(survses) +
geom_bar(aes(x = factor(sex), fill = factor(race)), position = "fill") +
ylab("proportion") + guides(fill = guide_legend(reverse = TRUE))
## Recoding 0,1 to 0; 2,3,4 to 1, then name the new variable sick
library(car)
survses <- within(survses, {
sick <- recode(ips, recodes = "c(0,1) = 0; c(2,3,4) = 1")
})
## t-test: one at a time
t.test(data = survses, tension ~ sick)
Welch Two Sample t-test
data: tension by sick
t = -3.363, df = 512.3, p-value = 0.0008291
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-2.9181 -0.7659
sample estimates:
mean in group 0 mean in group 1
12.45 14.29
t.test(data = survses, total ~ sick)
Welch Two Sample t-test
data: total by sick
t = -4.726, df = 599.6, p-value = 0.000002859
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-16.927 -6.988
sample estimates:
mean in group 0 mean in group 1
33.48 45.44
## Probably no simple way to perform multiple tests
## multi.tests() function defined here
multi.tests <- function(fun = t.test, df, vars, group.var, ...) {
sapply(simplify = FALSE,
vars,
function(var) {
formula <- as.formula(paste(var, "~", group.var))
fun(data = df, formula, ...)
}
)
}
## Multiple t-tests
res.multi.t.tests <-
multi.tests(fun = t.test,
df = survses,
vars = c("tension","depress","anger","vigor","fatigue","confuse","total"),
group.var = "sick",
var.equal = TRUE)
res.multi.t.tests
$tension
Two Sample t-test
data: tension by sick
t = -3.511, df = 927, p-value = 0.0004678
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-2.8716 -0.8124
sample estimates:
mean in group 0 mean in group 1
12.45 14.29
$depress
Two Sample t-test
data: depress by sick
t = -3.334, df = 830, p-value = 0.0008927
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-3.881 -1.005
sample estimates:
mean in group 0 mean in group 1
11.22 13.66
$anger
Two Sample t-test
data: anger by sick
t = -0.9467, df = 728, p-value = 0.3441
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.5820 0.5526
sample estimates:
mean in group 0 mean in group 1
7.596 8.111
$vigor
Two Sample t-test
data: vigor by sick
t = 7.813, df = 921, p-value = 0.00000000000001522
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
2.617 4.373
sample estimates:
mean in group 0 mean in group 1
14.79 11.30
$fatigue
Two Sample t-test
data: fatigue by sick
t = -6.324, df = 864, p-value = 0.0000000004085
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-4.080 -2.147
sample estimates:
mean in group 0 mean in group 1
9.89 13.00
$confuse
Two Sample t-test
data: confuse by sick
t = -4.441, df = 901, p-value = 0.00001004
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-2.2179 -0.8585
sample estimates:
mean in group 0 mean in group 1
6.946 8.484
$total
Two Sample t-test
data: total by sick
t = -4.546, df = 905, p-value = 0.000006195
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-17.119 -6.796
sample estimates:
mean in group 0 mean in group 1
33.48 45.44
## p-values can be extracted from the result object
data.frame(p.value = sapply(res.multi.t.tests, getElement, name = "p.value"))
p.value
tension 0.00046779024854653
depress 0.00089265010722945
anger 0.34410112363187073
vigor 0.00000000000001522
fatigue 0.00000000040851219
confuse 0.00001004401067984
total 0.00000619451833615
## Multiple Wilcoxon's rank sum tests
res.multi.wilcox.tests <-
multi.tests(fun = wilcox.test,
df = survses,
vars = c("tension","depress"),
group.var = "sick")
res.multi.wilcox.tests
$tension
Wilcoxon rank sum test with continuity correction
data: tension by sick
W = 81298, p-value = 0.001758
alternative hypothesis: true location shift is not equal to 0
$depress
Wilcoxon rank sum test with continuity correction
data: depress by sick
W = 64933, p-value = 0.0003463
alternative hypothesis: true location shift is not equal to 0
## ANOVA: Make sure the grouping variable is a factor (categorical variable in R)
anova(lm(data = survses, tension ~ factor(sick)))
Analysis of Variance Table
Response: tension
Df Sum Sq Mean Sq F value Pr(>F)
factor(sick) 1 681 681 12.3 0.00047 ***
Residuals 927 51178 55
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Create dataset with only dead subjects
survses.dead <- subset(survses, dead == 1 & !is.na(dead))
## Recoding
library(recode)
Error: there is no package called 'recode'
survses.dead <- within(survses.dead, {
mort1yr <- (time <= 365)
hiedu <- (educat > 2)
hiinc <- (income > 2)
sick <- recode(ips, recodes = "c(0,1) = 0; c(2,3,4) = 1")
})
## Counts
addmargins(table(survses.dead$mort1yr))
FALSE TRUE Sum
643 1038 1681
## Exact binomial test: H0 = 1yr survival is equal to 0.25
binom.test(x = 643, n = 1681, p = 0.25)
Exact binomial test
data: 643 and 1681
number of successes = 643, number of trials = 1681, p-value < 2.2e-16
alternative hypothesis: true probability of success is not equal to 0.25
95 percent confidence interval:
0.3592 0.4062
sample estimates:
probability of success
0.3825
## Cross table
tab.hiedu.mort1yr <- xtabs(data = survses.dead, ~ hiedu + mort1yr)[,2:1]
tab.hiedu.mort1yr
mort1yr
hiedu TRUE FALSE
FALSE 418 297
TRUE 86 72
## Include NA value
xtabs(data = survses.dead, ~ hiedu + mort1yr, na.action = na.pass, exclude = NULL)
mort1yr
hiedu FALSE TRUE
FALSE 297 418
TRUE 72 86
<NA> 274 534
## SAS-like table
library(gmodels)
CrossTable(tab.hiedu.mort1yr)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 873
| mort1yr
hiedu | TRUE | FALSE | Row Total |
-------------|-----------|-----------|-----------|
FALSE | 418 | 297 | 715 |
| 0.066 | 0.090 | |
| 0.585 | 0.415 | 0.819 |
| 0.829 | 0.805 | |
| 0.479 | 0.340 | |
-------------|-----------|-----------|-----------|
TRUE | 86 | 72 | 158 |
| 0.298 | 0.407 | |
| 0.544 | 0.456 | 0.181 |
| 0.171 | 0.195 | |
| 0.099 | 0.082 | |
-------------|-----------|-----------|-----------|
Column Total | 504 | 369 | 873 |
| 0.577 | 0.423 | |
-------------|-----------|-----------|-----------|
## Another table including NA
xtabs(data = survses.dead, ~ hiedu +educat, exclude = NULL, na.action = na.pass)
educat
hiedu 1 2 3 4 <NA>
FALSE 291 424 0 0 0
TRUE 0 0 122 36 0
<NA> 0 0 0 0 808
## Cross table
xtabs(data = survses.dead, ~ hiinc + mort1yr)[,2:1]
mort1yr
hiinc TRUE FALSE
FALSE 318 221
TRUE 127 116
## sick * mort1yr table
tab.sick.mort1yr <- xtabs(data = survses.dead, ~ sick +mort1yr)[2:1,2:1]
tab.sick.mort1yr
mort1yr
sick TRUE FALSE
1 387 202
0 632 435
## Chi-squared test
chisq.test(tab.sick.mort1yr, correct = FALSE)
Pearson's Chi-squared test
data: tab.sick.mort1yr
X-squared = 6.718, df = 1, p-value = 0.009544
## Fisher's exact test
fisher.test(tab.sick.mort1yr)
Fisher's Exact Test for Count Data
data: tab.sick.mort1yr
p-value = 0.009767
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.064 1.636
sample estimates:
odds ratio
1.318
##library(epiR)'s epi.2by2 function is useful for calculating risk (Inc risk),
## relative risk (Inc risk ratio), and odss ratio
library(epiR)
epi.2by2(tab.sick.mort1yr, units = 1)
Disease + Disease - Total Inc risk * Odds
Exposed + 387 202 589 0.657 1.92
Exposed - 632 435 1067 0.592 1.45
Total 1019 637 1656 0.615 1.60
Point estimates and 95 % CIs:
---------------------------------------------------------
Inc risk ratio 1.11 (1.03, 1.2)
Odds ratio 1.32 (1.07, 1.63)
Attrib risk * 0.06 (0.02, 0.11)
Attrib risk in population * 0.02 (-0.01, 0.06)
Attrib fraction in exposed (%) 9.85 (2.67, 16.51)
Attrib fraction in population (%) 3.74 (0.89, 6.51)
---------------------------------------------------------
* Cases per population unit
## sick * mort1yr stratified by race
tab.sick.mort1yr.race <- xtabs(data = survses.dead, ~ sick +mort1yr +race)[2:1,2:1,]
tab.sick.mort1yr.race
, , race = 1
mort1yr
sick TRUE FALSE
1 336 169
0 553 395
, , race = 2
mort1yr
sick TRUE FALSE
1 43 31
0 67 35
, , race = 3
mort1yr
sick TRUE FALSE
1 8 2
0 11 5
## epi.2by2 can do stratified analysis too. Not a big difference here.
epi.2by2(tab.sick.mort1yr.race, units = 1)
Disease + Disease - Total Inc risk * Odds
Exposed + 387 202 589 0.657 1.92
Exposed - 631 435 1066 0.592 1.45
Total 1018 637 1655 0.615 1.60
Point estimates and 95 % CIs:
---------------------------------------------------------
Inc risk ratio (crude) 1.11 (1.03, 1.2)
Inc risk ratio (M-H) 1.11 (1.03, 1.2)
Inc risk ratio (crude:M-H) 1
Odds ratio (crude) 1.32 (1.07, 1.63)
Odds ratio (M-H) 1.32 (1.05, 1.65)
Odds ratio (crude:M-H) 1
Attrib risk (crude) * 0.07 (0.02, 0.11)
Attrib risk (M-H) * 0.06 (0, 0.13)
Attrib risk (crude:M-H) 1.01
---------------------------------------------------------
* Cases per population unit
## Mantel-Haenszel test. H0 = common OR is 1
mantelhaen.test(tab.sick.mort1yr.race)
Mantel-Haenszel chi-squared test with continuity correction
data: tab.sick.mort1yr.race
Mantel-Haenszel X-squared = 6.404, df = 1, p-value = 0.01138
alternative hypothesis: true common odds ratio is not equal to 1
95 percent confidence interval:
1.069 1.625
sample estimates:
common odds ratio
1.318
## Breslow-Day test for homogeneity of OR
epi.2by2(tab.sick.mort1yr.race, units = 1, verbose = TRUE)$OR.homog
test.statistic df p.value
1 4.183 2 0.1235