## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
echo = TRUE, fig.width = 5, fig.height = 5)
options(width = 116, scipen = 10)
setwd("~/statistics/bio201/")
References
library(foreign)
## Load stata data
hospital <- read.dta("./HOSPITAL.DAT.dta")
## Check structure
str(hospital)
'data.frame': 25 obs. of 9 variables:
$ Id : int 1 2 3 4 5 6 7 8 9 10 ...
$ Dur_stay: int 5 10 6 11 5 14 30 11 17 3 ...
$ Age : int 30 73 40 47 25 82 60 56 43 50 ...
$ Sex : int 2 2 2 2 2 1 1 2 2 1 ...
$ Temp : num 99 98 99 98.2 98.5 96.8 99.5 98.6 98 98 ...
$ wbc : int 8 5 12 4 11 6 8 7 7 12 ...
$ Antibio : int 2 2 2 2 2 1 1 2 2 2 ...
$ Bact_cul: int 2 1 2 2 2 2 1 2 2 1 ...
$ Service : int 1 1 2 2 2 2 1 1 1 2 ...
- attr(*, "datalabel")= chr ""
- attr(*, "time.stamp")= chr ""
- attr(*, "formats")= chr "%8.0g" "%8.0g" "%8.0g" "%8.0g" ...
- attr(*, "types")= int 251 251 251 251 255 251 251 251 251
- attr(*, "val.labels")= chr "" "" "" "" ...
- attr(*, "var.labels")= chr "" "" "" "" ...
- attr(*, "version")= int 12
summary(hospital)
Id Dur_stay Age Sex Temp wbc Antibio
Min. : 1 Min. : 3.0 Min. : 4.0 Min. :1.00 Min. :96.8 Min. : 3.00 Min. :1.00
1st Qu.: 7 1st Qu.: 5.0 1st Qu.:25.0 1st Qu.:1.00 1st Qu.:98.0 1st Qu.: 5.00 1st Qu.:1.00
Median :13 Median : 8.0 Median :41.0 Median :2.00 Median :98.2 Median : 7.00 Median :2.00
Mean :13 Mean : 8.6 Mean :41.2 Mean :1.56 Mean :98.3 Mean : 7.84 Mean :1.72
3rd Qu.:19 3rd Qu.:11.0 3rd Qu.:56.0 3rd Qu.:2.00 3rd Qu.:98.6 3rd Qu.:11.00 3rd Qu.:2.00
Max. :25 Max. :30.0 Max. :82.0 Max. :2.00 Max. :99.5 Max. :14.00 Max. :2.00
Bact_cul Service
Min. :1.00 Min. :1.00
1st Qu.:2.00 1st Qu.:1.00
Median :2.00 Median :2.00
Mean :1.76 Mean :1.64
3rd Qu.:2.00 3rd Qu.:2.00
Max. :2.00 Max. :2.00
head(hospital)
Id Dur_stay Age Sex Temp wbc Antibio Bact_cul Service
1 1 5 30 2 99.0 8 2 2 1
2 2 10 73 2 98.0 5 2 1 1
3 3 6 40 2 99.0 12 2 2 2
4 4 11 47 2 98.2 4 2 2 2
5 5 5 25 2 98.5 11 2 2 2
6 6 14 82 1 96.8 6 1 2 2
## mean and median
summary(hospital$Dur_stay)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.0 5.0 8.0 8.6 11.0 30.0
## sd
sd(hospital$Dur_stay)
[1] 5.715
## range
diff(range(hospital$Dur_stay))
[1] 27
## duration of stay grouped by antibiotic use
library(doBy)
summaryBy(Dur_stay ~ Antibio, hospital, FUN = summary)
Antibio Dur_stay.Min. Dur_stay.1st Qu. Dur_stay.Median Dur_stay.Mean Dur_stay.3rd Qu. Dur_stay.Max.
1 1 3 7.5 8.0 11.60 12.50 30
2 2 3 5.0 6.5 7.44 9.75 17
## Box plot
boxplot(Dur_stay ~ Antibio, data = hospital, xlab = "Abx use")
## Create data
pos.bp <- read.table(text = "
id sbp_recumbent dbp_recumbent sbp_standing dbp_standing
1 bra 99 71 105 79
2 jab 126 74 124 76
3 flb 108 72 102 68
4 vpb 122 68 114 72
5 mfb 104 64 96 62
6 ehb 108 60 96 56
7 gc 116 70 106 70
8 mmc 106 74 106 76
9 tjf 118 82 120 90
10 rrf 92 58 88 60
11 crf 110 78 102 80
12 ewg 138 80 124 76
13 tfh 120 70 118 84
14 ejh 142 88 136 90
15 hbh 118 58 92 58
16 rtk 134 76 126 68
17 wel 118 72 108 68
18 rll 126 78 114 76
19 hsm 108 78 94 70
20 vjm 136 86 144 88
21 rhp 110 78 100 64
22 rcr 120 74 106 70
23 jar 108 74 94 74
24 akr 132 92 128 88
25 ths 102 68 96 64
26 oes 118 70 102 68
27 res 116 76 88 60
28 ect 118 80 100 84
29 jht 110 74 96 70
30 fpv 122 72 118 78
31 pfw 106 62 94 56
32 wjw 146 90 138 94
")
## Create difference data
pos.bp <- within(pos.bp, {
diff_sbp <- sbp_recumbent - sbp_standing
diff_dbp <- dbp_recumbent - dbp_standing
})
## Means and medians
lapply(pos.bp[,c("diff_sbp","diff_dbp")], summary)
$diff_sbp
Min. 1st Qu. Median Mean 3rd Qu. Max.
-8.00 4.00 8.00 8.81 14.00 28.00
$diff_dbp
Min. 1st Qu. Median Mean 3rd Qu. Max.
-14.000 -2.000 1.000 0.938 4.000 16.000
## stem-and-leaf plot
library(plyr)
l_ply(pos.bp[,c("diff_sbp","diff_dbp")], stem)
The decimal point is 1 digit(s) to the right of the |
-0 | 86
-0 | 2
0 | 022444
0 | 66688888
1 | 00022244444
1 | 68
2 |
2 | 68
The decimal point is 1 digit(s) to the right of the |
-1 | 4
-0 | 886
-0 | 444222222
0 | 00022244444444
0 | 688
1 | 4
1 | 6
## Create data for boxplot
library(reshape2)
pos.bp.melt <- melt(pos.bp[,c("diff_sbp","diff_dbp")])
## boxplots
boxplot(value ~ variable, pos.bp.melt)
## Deciles
quantile(pos.bp$diff_sbp, probs = seq(0, 1, 0.1))
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
-8.0 0.2 4.0 6.0 8.0 8.0 10.0 12.0 14.0 15.8 28.0
quantile(pos.bp$diff_sbp, probs = c(0.1, 0.9))
10% 90%
0.2 15.8
## Probabilities of disease for subjects
Pm77 <- 4.9 / 100
Pf76 <- 2.3 / 100
Pf82 <- 7.8 / 100
## 3.16 All diseased
all.diseased <- Pm77 * Pf76 * Pf82
all.diseased
[1] 0.00008791
## 3.17 At least one woman
at.least.one.woman.diseased <- 1 - ((1 - Pf76) * (1 - Pf82))
at.least.one.woman.diseased
[1] 0.09921
## 3.18 At least one diseased
no.one.diseased <- ((1 - Pm77) * (1 - Pf76) * (1 - Pf82))
at.least.one.diseased <- 1 - no.one.diseased
at.least.one.diseased
[1] 0.1433
## 3.19 Exactly one diseased
only.man.diseased <- ((Pm77) * (1 - Pf76) * (1 - Pf82))
only.woman1.diseased <- ((1 - Pm77) * (Pf76) * (1 - Pf82))
only.woman2.diseased <- ((1 - Pm77) * (1 - Pf76) * (Pf82))
exactly.one.diseased <- only.man.diseased + only.woman1.diseased + only.woman2.diseased
exactly.one.diseased
[1] 0.1368
## 3.20 Disesed is a woman given one person is diseased
man.diseased.given.one.diseased <- (only.man.diseased / exactly.one.diseased)
woman.diseased.given.one.diseased <- 1 - man.diseased.given.one.diseased
woman.diseased.given.one.diseased
[1] 0.6773
## 3.21 Diseased are both women given two persons are diseased
only.man.healthy <- ((1 - Pm77) * (Pf76) * (Pf82))
only.woman1.healthy <- ((Pm77) * (1 - Pf76) * (Pf82))
only.woman2.healthy <- ((Pm77) * (Pf76) * (1 - Pf82))
exactly.one.healthy <- only.man.healthy + only.woman1.healthy + only.woman2.healthy
two.women.diseased.given.two.diseased <- only.man.healthy / exactly.one.healthy
two.women.diseased.given.two.diseased
[1] 0.2633
## 3.22 Diseased are both in 70's given two persons are diseased
only.woman2.healthy / exactly.one.healthy
[1] 0.1604
## age-specific cases per 100 population
prevalence <-
read.table(header = TRUE, text = "
age.group males females
'65-69' 1.6 0.0
'70-74' 0.0 2.2
'75-79' 4.9 2.3
'80-84' 8.6 7.8
'85+' 35.0 27.9
")
## cases per 1 population
prevalence[,2:3] <- prevalence[,2:3] / 100
## age-specific population distribution in %
age.distribution <-
read.table(header = TRUE, text = "
age.group males females
'65-69' 5 10
'70-74' 9 17
'75-79' 11 18
'80-84' 8 12
'85+' 4 6
")
## age-specific population distribution in proportion
age.distribution[,2:3] <- age.distribution[,2:3] / 100
## 3.26 prevalence in this specific population
age.sex.specific.prevalence <- prevalence[,2:3] * age.distribution[,2:3]
sum(age.sex.specific.prevalence)
[1] 0.06105
## 3.27 Number of cases if population size is 1000
sum(age.sex.specific.prevalence) * 1000
[1] 61.05
## 3.89
5556 / 5924
[1] 0.9379
## 3.90
262513 / 265701
[1] 0.988
## 3.91
zero.additional <- (7022 - 5924) / 7022
one.additional <- ((368 - 277) + (5556 - 3916)) / 7022
two.additional <- (277 + 3916) / 7022
list(zero.additional, one.additional, two.additional)
[[1]]
[1] 0.1564
[[2]]
[1] 0.2465
[[3]]
[1] 0.5971
## 3.92
zero.additional <- (350693 - 265701) / 350693
one.additional <- ((3188 - 2444) + (262513 - 79450)) / 350693
two.additional <- (2444 + 79450) / 350693
list(zero.additional, one.additional, two.additional)
[[1]]
[1] 0.2424
[[2]]
[1] 0.5241
[[3]]
[1] 0.2335
## b relative risk
risk.1st.dead <- 368 / 5924
risk.1st.live <- 3188 / 265701
rr <- risk.1st.dead / risk.1st.live
rr
[1] 5.177
fair.test <- read.table(header = TRUE, text = "
vas dis.pos dis.neg
'≤2' 5 14
'3-4' 3 12
'5-6' 7 6
'≥7' 7 6
")
## sensitivity and specificity
cutoff.4 <- rbind(colSums(fair.test[3:4, 2:3]),
colSums(fair.test[1:2, 2:3]))
library(epiR)
epi.tests(cutoff.4)
Disease + Disease - Total
Test + 14 12 26
Test - 8 26 34
Total 22 38 60
Point estimates and 95 % CIs:
---------------------------------------------------------
Apparent prevalence 0.43 (0.31, 0.57)
True prevalence 0.37 (0.25, 0.5)
Sensitivity 0.64 (0.41, 0.83)
Specificity 0.68 (0.51, 0.82)
Positive predictive value 0.54 (0.33, 0.73)
Negative predictive value 0.76 (0.59, 0.89)
---------------------------------------------------------
## ROC
roc.data <- rbind(c(0,0), fair.test[4:1,2:3], c(0,0))
roc.2by2 <- lapply(1:5, function(x) {
head <- head(roc.data, x)
tail <- tail(roc.data, 6 - x)
rbind(test.pos = colSums(head),
test.neg = colSums(tail))
})
## Calculate sens spec
sens.spec.data <- lapply(roc.2by2, epi.tests, verbose = TRUE)
## Extract
sens.spec.data <- lapply(sens.spec.data, function(x) data.frame(specificity = x$sp$est, sensitivity = x$se$est))
## Collapse as data frame
sens.spec.data <- do.call(rbind, sens.spec.data)
## Plot
library(ggplot2)
ggplot(sens.spec.data, aes(x = 1 - specificity, y = sensitivity)) + geom_point() + geom_line()