Biostatistics 201: Fundamentals of Statistics Problem sets 1

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE, 
    echo = TRUE, fig.width = 5, fig.height = 5)
options(width = 116, scipen = 10)

setwd("~/statistics/bio201/")

References

Problems 2.1-2.3

library(foreign)

## Load stata data
hospital <- read.dta("./HOSPITAL.DAT.dta")

## Check structure
str(hospital)
'data.frame':   25 obs. of  9 variables:
 $ Id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Dur_stay: int  5 10 6 11 5 14 30 11 17 3 ...
 $ Age     : int  30 73 40 47 25 82 60 56 43 50 ...
 $ Sex     : int  2 2 2 2 2 1 1 2 2 1 ...
 $ Temp    : num  99 98 99 98.2 98.5 96.8 99.5 98.6 98 98 ...
 $ wbc     : int  8 5 12 4 11 6 8 7 7 12 ...
 $ Antibio : int  2 2 2 2 2 1 1 2 2 2 ...
 $ Bact_cul: int  2 1 2 2 2 2 1 2 2 1 ...
 $ Service : int  1 1 2 2 2 2 1 1 1 2 ...
 - attr(*, "datalabel")= chr ""
 - attr(*, "time.stamp")= chr ""
 - attr(*, "formats")= chr  "%8.0g" "%8.0g" "%8.0g" "%8.0g" ...
 - attr(*, "types")= int  251 251 251 251 255 251 251 251 251
 - attr(*, "val.labels")= chr  "" "" "" "" ...
 - attr(*, "var.labels")= chr  "" "" "" "" ...
 - attr(*, "version")= int 12
summary(hospital)
       Id        Dur_stay         Age            Sex            Temp           wbc           Antibio    
 Min.   : 1   Min.   : 3.0   Min.   : 4.0   Min.   :1.00   Min.   :96.8   Min.   : 3.00   Min.   :1.00  
 1st Qu.: 7   1st Qu.: 5.0   1st Qu.:25.0   1st Qu.:1.00   1st Qu.:98.0   1st Qu.: 5.00   1st Qu.:1.00  
 Median :13   Median : 8.0   Median :41.0   Median :2.00   Median :98.2   Median : 7.00   Median :2.00  
 Mean   :13   Mean   : 8.6   Mean   :41.2   Mean   :1.56   Mean   :98.3   Mean   : 7.84   Mean   :1.72  
 3rd Qu.:19   3rd Qu.:11.0   3rd Qu.:56.0   3rd Qu.:2.00   3rd Qu.:98.6   3rd Qu.:11.00   3rd Qu.:2.00  
 Max.   :25   Max.   :30.0   Max.   :82.0   Max.   :2.00   Max.   :99.5   Max.   :14.00   Max.   :2.00  
    Bact_cul       Service    
 Min.   :1.00   Min.   :1.00  
 1st Qu.:2.00   1st Qu.:1.00  
 Median :2.00   Median :2.00  
 Mean   :1.76   Mean   :1.64  
 3rd Qu.:2.00   3rd Qu.:2.00  
 Max.   :2.00   Max.   :2.00  
head(hospital)
  Id Dur_stay Age Sex Temp wbc Antibio Bact_cul Service
1  1        5  30   2 99.0   8       2        2       1
2  2       10  73   2 98.0   5       2        1       1
3  3        6  40   2 99.0  12       2        2       2
4  4       11  47   2 98.2   4       2        2       2
5  5        5  25   2 98.5  11       2        2       2
6  6       14  82   1 96.8   6       1        2       2

## mean and median
summary(hospital$Dur_stay)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    3.0     5.0     8.0     8.6    11.0    30.0 

## sd
sd(hospital$Dur_stay)
[1] 5.715

## range
diff(range(hospital$Dur_stay))
[1] 27

## duration of stay grouped by antibiotic use
library(doBy)
summaryBy(Dur_stay ~ Antibio, hospital, FUN = summary)
  Antibio Dur_stay.Min. Dur_stay.1st Qu. Dur_stay.Median Dur_stay.Mean Dur_stay.3rd Qu. Dur_stay.Max.
1       1             3              7.5             8.0         11.60            12.50            30
2       2             3              5.0             6.5          7.44             9.75            17

## Box plot
boxplot(Dur_stay ~ Antibio, data = hospital, xlab = "Abx use")

plot of chunk unnamed-chunk-2

Problems 2.19-2.22

## Create data
pos.bp <- read.table(text = "
    id sbp_recumbent dbp_recumbent sbp_standing dbp_standing
1  bra            99            71          105           79
2  jab           126            74          124           76
3  flb           108            72          102           68
4  vpb           122            68          114           72
5  mfb           104            64           96           62
6  ehb           108            60           96           56
7   gc           116            70          106           70
8  mmc           106            74          106           76
9  tjf           118            82          120           90
10 rrf            92            58           88           60
11 crf           110            78          102           80
12 ewg           138            80          124           76
13 tfh           120            70          118           84
14 ejh           142            88          136           90
15 hbh           118            58           92           58
16 rtk           134            76          126           68
17 wel           118            72          108           68
18 rll           126            78          114           76
19 hsm           108            78           94           70
20 vjm           136            86          144           88
21 rhp           110            78          100           64
22 rcr           120            74          106           70
23 jar           108            74           94           74
24 akr           132            92          128           88
25 ths           102            68           96           64
26 oes           118            70          102           68
27 res           116            76           88           60
28 ect           118            80          100           84
29 jht           110            74           96           70
30 fpv           122            72          118           78
31 pfw           106            62           94           56
32 wjw           146            90          138           94
")

## Create difference data
pos.bp <- within(pos.bp, {
    diff_sbp <- sbp_recumbent - sbp_standing
    diff_dbp <- dbp_recumbent - dbp_standing
})

## Means and medians
lapply(pos.bp[,c("diff_sbp","diff_dbp")], summary)
$diff_sbp
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  -8.00    4.00    8.00    8.81   14.00   28.00 

$diff_dbp
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-14.000  -2.000   1.000   0.938   4.000  16.000 


## stem-and-leaf plot
library(plyr)
l_ply(pos.bp[,c("diff_sbp","diff_dbp")], stem)

  The decimal point is 1 digit(s) to the right of the |

  -0 | 86
  -0 | 2
   0 | 022444
   0 | 66688888
   1 | 00022244444
   1 | 68
   2 | 
   2 | 68


  The decimal point is 1 digit(s) to the right of the |

  -1 | 4
  -0 | 886
  -0 | 444222222
   0 | 00022244444444
   0 | 688
   1 | 4
   1 | 6



## Create data for boxplot
library(reshape2)
pos.bp.melt <- melt(pos.bp[,c("diff_sbp","diff_dbp")])

## boxplots
boxplot(value ~ variable, pos.bp.melt)

plot of chunk unnamed-chunk-3


## Deciles
quantile(pos.bp$diff_sbp, probs = seq(0, 1, 0.1))
  0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
-8.0  0.2  4.0  6.0  8.0  8.0 10.0 12.0 14.0 15.8 28.0 
quantile(pos.bp$diff_sbp, probs = c(0.1, 0.9))
 10%  90% 
 0.2 15.8 

Problems 3.16-3.22, 3.26-3.27

## Probabilities of disease for subjects
Pm77 <- 4.9 / 100
Pf76 <- 2.3 / 100
Pf82 <- 7.8 / 100

## 3.16 All diseased
all.diseased <- Pm77 * Pf76 * Pf82
all.diseased
[1] 0.00008791

## 3.17 At least one woman
at.least.one.woman.diseased <- 1 - ((1 - Pf76) * (1 - Pf82))
at.least.one.woman.diseased
[1] 0.09921

## 3.18 At least one diseased
no.one.diseased <- ((1 - Pm77) * (1 - Pf76) * (1 - Pf82))
at.least.one.diseased <- 1 - no.one.diseased
at.least.one.diseased
[1] 0.1433

## 3.19 Exactly one diseased
only.man.diseased    <- ((Pm77) * (1 - Pf76) * (1 - Pf82))
only.woman1.diseased <- ((1 - Pm77) * (Pf76) * (1 - Pf82))
only.woman2.diseased <- ((1 - Pm77) * (1 - Pf76) * (Pf82))

exactly.one.diseased <- only.man.diseased + only.woman1.diseased + only.woman2.diseased
exactly.one.diseased
[1] 0.1368

## 3.20 Disesed is a woman given one person is diseased
man.diseased.given.one.diseased <- (only.man.diseased / exactly.one.diseased)
woman.diseased.given.one.diseased <- 1 - man.diseased.given.one.diseased
woman.diseased.given.one.diseased
[1] 0.6773

## 3.21 Diseased are both women given two persons are diseased
only.man.healthy    <- ((1 - Pm77) * (Pf76) * (Pf82))
only.woman1.healthy <- ((Pm77) * (1 - Pf76) * (Pf82))
only.woman2.healthy <- ((Pm77) * (Pf76) * (1 - Pf82))

exactly.one.healthy <- only.man.healthy + only.woman1.healthy + only.woman2.healthy
two.women.diseased.given.two.diseased <- only.man.healthy / exactly.one.healthy
two.women.diseased.given.two.diseased
[1] 0.2633

## 3.22 Diseased are both in 70's given two persons are diseased
only.woman2.healthy / exactly.one.healthy
[1] 0.1604

## age-specific cases per 100 population
prevalence <-
    read.table(header = TRUE, text = "
age.group       males   females
'65-69'         1.6     0.0
'70-74'         0.0     2.2
'75-79'         4.9     2.3
'80-84'         8.6     7.8
'85+'          35.0    27.9
")
## cases per 1 population
prevalence[,2:3] <- prevalence[,2:3] / 100
## age-specific population distribution in %
age.distribution <-
    read.table(header = TRUE, text = "
age.group       males   females
'65-69'          5     10
'70-74'          9     17
'75-79'         11     18
'80-84'          8     12
'85+'            4      6
")
## age-specific population distribution in proportion
age.distribution[,2:3] <- age.distribution[,2:3] / 100

## 3.26 prevalence in this specific population
age.sex.specific.prevalence <- prevalence[,2:3] * age.distribution[,2:3]
sum(age.sex.specific.prevalence)
[1] 0.06105

## 3.27 Number of cases if population size is 1000
sum(age.sex.specific.prevalence) * 1000
[1] 61.05

3.89-3.92

## 3.89
5556 / 5924
[1] 0.9379

## 3.90
262513 / 265701
[1] 0.988

## 3.91
zero.additional <- (7022 - 5924) / 7022
one.additional <- ((368 - 277) + (5556 - 3916)) / 7022
two.additional <- (277 + 3916) / 7022

list(zero.additional, one.additional, two.additional)
[[1]]
[1] 0.1564

[[2]]
[1] 0.2465

[[3]]
[1] 0.5971


## 3.92
zero.additional <- (350693 - 265701) / 350693
one.additional <- ((3188 - 2444) + (262513 - 79450)) / 350693
two.additional <- (2444 + 79450) / 350693

list(zero.additional, one.additional, two.additional)
[[1]]
[1] 0.2424

[[2]]
[1] 0.5241

[[3]]
[1] 0.2335


## b relative risk
risk.1st.dead <- 368 / 5924
risk.1st.live <- 3188 / 265701

rr <- risk.1st.dead / risk.1st.live
rr
[1] 5.177

3.114-3.117

fair.test <- read.table(header = TRUE, text = "
vas           dis.pos       dis.neg
'≤2'            5               14
'3-4'           3               12
'5-6'           7                6
'≥7'            7                6
")

## sensitivity and specificity
cutoff.4 <- rbind(colSums(fair.test[3:4, 2:3]),
                  colSums(fair.test[1:2, 2:3]))
library(epiR)
epi.tests(cutoff.4)
          Disease +    Disease -      Total
Test +           14           12         26
Test -            8           26         34
Total            22           38         60

Point estimates and 95 % CIs:
---------------------------------------------------------
Apparent prevalence                    0.43 (0.31, 0.57)
True prevalence                        0.37 (0.25, 0.5)
Sensitivity                            0.64 (0.41, 0.83)
Specificity                            0.68 (0.51, 0.82)
Positive predictive value              0.54 (0.33, 0.73)
Negative predictive value              0.76 (0.59, 0.89)
---------------------------------------------------------


## ROC
roc.data <- rbind(c(0,0), fair.test[4:1,2:3], c(0,0))

roc.2by2 <- lapply(1:5, function(x) {
    head <- head(roc.data, x)
    tail <- tail(roc.data, 6 - x)
    rbind(test.pos = colSums(head),
          test.neg = colSums(tail))
})

## Calculate sens spec
sens.spec.data <- lapply(roc.2by2, epi.tests, verbose = TRUE)

## Extract
sens.spec.data <- lapply(sens.spec.data, function(x) data.frame(specificity = x$sp$est, sensitivity = x$se$est))

## Collapse as data frame
sens.spec.data <- do.call(rbind, sens.spec.data)

## Plot
library(ggplot2)
ggplot(sens.spec.data, aes(x = 1 - specificity, y = sensitivity)) + geom_point() + geom_line()

plot of chunk unnamed-chunk-6