Epidemiology in R

library(epiR)

## Loading required package: survival

## Package epiR 0.9-93 is loaded

## Type help(epi.about) for summary information

##

dat <- matrix(c(13,2163,5,3349), nrow = 2, byrow = TRUE)
rownames(dat) <- c("DF+", "DF-"); colnames(dat) <- c("FUS+", "FUS-"); dat

##     FUS+ FUS-
## DF+   13 2163
## DF-    5 3349

epi.2by2(dat = as.table(dat), method = "cross.sectional",
conf.level = 0.95, units = 100, homogeneity = "breslow.day",
outcome = "as.columns")

##              Outcome +    Outcome -      Total        Prevalence *
## Exposed +           13         2163       2176               0.597
## Exposed -            5         3349       3354               0.149
## Total               18         5512       5530               0.325
##                  Odds
## Exposed +     0.00601
## Exposed -     0.00149
## Total         0.00327
## 
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Prevalence ratio                             4.01 (1.43, 11.23)
## Odds ratio                                   4.03 (1.43, 11.31)
## Attrib prevalence *                          0.45 (0.10, 0.80)
## Attrib prevalence in population *            0.18 (-0.02, 0.38)
## Attrib fraction in exposed (%)              75.05 (30.11, 91.09)
## Attrib fraction in population (%)           54.20 (3.61, 78.24)
## -------------------------------------------------------------------
##  X2 test statistic: 8.177 p-value: 0.004
##  Wald confidence limits
##  * Outcomes per 100 population units

dat <- matrix(c(13,5,2163,3349), nrow = 2, byrow = TRUE)
rownames(dat) <- c("FUS+", "FUS-"); colnames(dat) <- c("DF+", "DF-"); dat

##       DF+  DF-
## FUS+   13    5
## FUS- 2163 3349

epi.2by2(dat = as.table(dat), method = "cross.sectional",
conf.level = 0.95, units = 100, homogeneity = "breslow.day",
outcome = "as.rows")

##              Exposed +    Exposed -      Total
## Outcome +           13            5         18
## Outcome -         2163         3349       5512
## Total             2176         3354       5530
## 
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Prevalence ratio                             4.01 (1.43, 11.23)
## Odds ratio                                   4.03 (1.43, 11.31)
## Attrib prevalence *                          0.45 (0.10, 0.80)
## Attrib prevalence in population *            0.18 (-0.02, 0.38)
## Attrib fraction in exposed (%)              75.05 (30.11, 91.09)
## Attrib fraction in population (%)           54.20 (3.61, 78.24)
## -------------------------------------------------------------------
##  X2 test statistic: 8.177 p-value: 0.004
##  Wald confidence limits
##  * Outcomes per 100 population units

library(MASS)
dat1 <- birthwt; head(dat1)

##    low age lwt race smoke ptl ht ui ftv  bwt
## 85   0  19 182    2     0   0  0  1   0 2523
## 86   0  33 155    3     0   0  0  0   3 2551
## 87   0  20 105    1     1   0  0  0   1 2557
## 88   0  21 108    1     1   0  0  1   2 2594
## 89   0  18 107    1     1   0  0  1   0 2600
## 91   0  21 124    3     0   0  0  0   0 2622

dat1$low <- factor(dat1$low, levels = c(1,0))
dat1$smoke <- factor(dat1$smoke, levels = c(1,0))
dat1$race <- factor(dat1$race, levels = c(1,2,3))
tab1 <- table(dat1$smoke, dat1$low, dnn = c("Smoke", "Low BW"))
print(tab1)

##      Low BW
## Smoke  1  0
##     1 30 44
##     0 29 86

epi.2by2(dat = tab1, method = "cohort.count",
conf.level = 0.95, units = 100, homogeneity = "breslow.day",
outcome = "as.columns")

##              Outcome +    Outcome -      Total        Inc risk *
## Exposed +           30           44         74              40.5
## Exposed -           29           86        115              25.2
## Total               59          130        189              31.2
##                  Odds
## Exposed +       0.682
## Exposed -       0.337
## Total           0.454
## 
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio                               1.61 (1.06, 2.44)
## Odds ratio                                   2.02 (1.08, 3.78)
## Attrib risk *                                15.32 (1.61, 29.04)
## Attrib risk in population *                  6.00 (-4.33, 16.33)
## Attrib fraction in exposed (%)               37.80 (5.47, 59.07)
## Attrib fraction in population (%)            19.22 (-0.21, 34.88)
## -------------------------------------------------------------------
##  X2 test statistic: 4.924 p-value: 0.026
##  Wald confidence limits
##  * Outcomes per 100 population units

tab2 <- table(dat1$smoke, dat1$low, dat1$race,
dnn = c("Smoke", "Low BW", "Race"))
print(tab2)

## , , Race = 1
## 
##      Low BW
## Smoke  1  0
##     1 19 33
##     0  4 40
## 
## , , Race = 2
## 
##      Low BW
## Smoke  1  0
##     1  6  4
##     0  5 11
## 
## , , Race = 3
## 
##      Low BW
## Smoke  1  0
##     1  5  7
##     0 20 35

rval <- epi.2by2(dat = tab2, method = "cohort.count",
conf.level = 0.95, units = 100, homogeneity = "breslow.day",
outcome = "as.columns")
print(rval)

##              Outcome +    Outcome -      Total        Inc risk *
## Exposed +           30           44         74              40.5
## Exposed -           29           86        115              25.2
## Total               59          130        189              31.2
##                  Odds
## Exposed +       0.682
## Exposed -       0.337
## Total           0.454
## 
## 
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio (crude)                       1.61 (1.06, 2.44)
## Inc risk ratio (M-H)                         2.15 (1.29, 3.58)
## Inc risk ratio (crude:M-H)                   0.75
## Odds ratio (crude)                           2.02 (1.08, 3.78)
## Odds ratio (M-H)                             3.09 (1.49, 6.39)
## Odds ratio (crude:M-H)                       0.66
## Attrib risk (crude) *                        15.32 (1.61, 29.04)
## Attrib risk (M-H) *                          22.17 (1.41, 42.94)
## Attrib risk (crude:M-H)                      0.69
## -------------------------------------------------------------------
##  Test of homogeneity of IRR: X2 test statistic: 3.815 p-value: 0.148
##  Test of homogeneity of  OR: X2 test statistic: 3.126 p-value: 0.21
##  Wald confidence limits
##  M-H: Mantel-Haenszel
##  * Outcomes per 100 population units

rval$massoc$ARisk.mh.green

##        est    lower    upper
## 1 22.17306 8.797794 35.54832

rval$massoc$ARisk.mh.wald

##        est    lower    upper
## 1 22.17306 1.410787 42.93532

dat2 <- data.frame(tab2)
print(dat2)

##    Smoke Low.BW Race Freq
## 1      1      1    1   19
## 2      0      1    1    4
## 3      1      0    1   33
## 4      0      0    1   40
## 5      1      1    2    6
## 6      0      1    2    5
## 7      1      0    2    4
## 8      0      0    2   11
## 9      1      1    3    5
## 10     0      1    3   20
## 11     1      0    3    7
## 12     0      0    3   35

tab3 <- xtabs(Freq ~ Smoke + Low.BW + Race, data = dat2)
print(tab3)

## , , Race = 1
## 
##      Low.BW
## Smoke  1  0
##     1 19 33
##     0  4 40
## 
## , , Race = 2
## 
##      Low.BW
## Smoke  1  0
##     1  6  4
##     0  5 11
## 
## , , Race = 3
## 
##      Low.BW
## Smoke  1  0
##     1  5  7
##     0 20 35

rval <- epi.2by2(dat = tab3, method = "cohort.count",
conf.level = 0.95, units = 100, homogeneity = "breslow.day",
outcome = "as.columns")
print(rval)

##              Outcome +    Outcome -      Total        Inc risk *
## Exposed +           30           44         74              40.5
## Exposed -           29           86        115              25.2
## Total               59          130        189              31.2
##                  Odds
## Exposed +       0.682
## Exposed -       0.337
## Total           0.454
## 
## 
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio (crude)                       1.61 (1.06, 2.44)
## Inc risk ratio (M-H)                         2.15 (1.29, 3.58)
## Inc risk ratio (crude:M-H)                   0.75
## Odds ratio (crude)                           2.02 (1.08, 3.78)
## Odds ratio (M-H)                             3.09 (1.49, 6.39)
## Odds ratio (crude:M-H)                       0.66
## Attrib risk (crude) *                        15.32 (1.61, 29.04)
## Attrib risk (M-H) *                          22.17 (1.41, 42.94)
## Attrib risk (crude:M-H)                      0.69
## -------------------------------------------------------------------
##  Test of homogeneity of IRR: X2 test statistic: 3.815 p-value: 0.148
##  Test of homogeneity of  OR: X2 test statistic: 3.126 p-value: 0.21
##  Wald confidence limits
##  M-H: Mantel-Haenszel
##  * Outcomes per 100 population units

rval$massoc$OR.crude.cfield

##        est    lower    upper
## 1 2.021944 1.073694 3.794586

rval$massoc$OR.crude.mle

##        est   lower    upper
## 1 2.014137 1.02878 3.964904

rval$massoc$OR.crude.score

##        est    lower    upper
## 1 2.021944 1.084385 3.773885

library(ggplot2); library(scales)
nstrata <- 1:dim(tab3)[3]
strata.lab <- paste("Strata ", nstrata, sep = "")
y.at <- c(nstrata, max(nstrata) + 1)
y.lab <- c("M-H", strata.lab)
x.at <- c(0.25, 0.5, 1, 2, 4, 8, 16, 32)
or.l <- c(rval$massoc$OR.mh$lower, rval$massoc$OR.strata.cfield$lower)
or.u <- c(rval$massoc$OR.mh$upper, rval$massoc$OR.strata.cfield$upper)
or.p <- c(rval$massoc$OR.mh$est, rval$massoc$OR.strata.cfield$est)
dat <- data.frame(y.at, y.lab, or.p, or.l, or.u)
ggplot(dat, aes(or.p, y.at)) +
geom_point() +
geom_errorbarh(aes(xmax = or.l, xmin = or.u, height = 0.2)) +
labs(x = "Odds ratio", y = "Strata") +
scale_x_continuous(trans = log2_trans(), breaks = x.at,
limits = c(0.25,32)) + scale_y_continuous(breaks = y.at, labels = y.lab) +
geom_vline(xintercept = 1, lwd = 1) + coord_fixed(ratio = 0.75 / 1) +
theme(axis.title.y = element_text(vjust = 0))

dat <- as.table(matrix(c(136,22050,1709,127650), nrow = 2, byrow = TRUE))
rval <- epi.2by2(dat = dat, method = "cohort.time", conf.level = 0.95,
units = 1000, homogeneity = "breslow.day", outcome = "as.columns")
summary(rval)$ARate.strata.wald

##        est     lower     upper
## 1 -7.22037 -8.435865 -6.004875

round(summary(rval)$IRR.strata.wald, digits = 2)

##    est lower upper
## 1 0.46  0.38  0.55

hid <- 1:8
ai <- c(23,10,20,5,14,6,10,3)
bi <- c(10,2,1,2,2,2,3,0)
ci <- c(3,2,3,2,1,3,3,2)
di <- c(6,4,3,2,6,3,1,1)
dat <- data.frame(hid, ai, bi, ci, di)
print(dat)

##   hid ai bi ci di
## 1   1 23 10  3  6
## 2   2 10  2  2  4
## 3   3 20  1  3  3
## 4   4  5  2  2  2
## 5   5 14  2  1  6
## 6   6  6  2  3  3
## 7   7 10  3  3  1
## 8   8  3  0  2  1

hid <- rep(1:8, times = 4)
exp <- factor(rep(c(1,1,0,0), each = 8), levels = c(1,0))
out <- factor(rep(c(1,0,1,0), each = 8), levels = c(1,0))
dat <- data.frame(hid, exp, out, n = c(ai,bi,ci,di))
dat <- xtabs(n ~ exp + out + hid, data = dat)
print(dat)

## , , hid = 1
## 
##    out
## exp  1  0
##   1 23 10
##   0  3  6
## 
## , , hid = 2
## 
##    out
## exp  1  0
##   1 10  2
##   0  2  4
## 
## , , hid = 3
## 
##    out
## exp  1  0
##   1 20  1
##   0  3  3
## 
## , , hid = 4
## 
##    out
## exp  1  0
##   1  5  2
##   0  2  2
## 
## , , hid = 5
## 
##    out
## exp  1  0
##   1 14  2
##   0  1  6
## 
## , , hid = 6
## 
##    out
## exp  1  0
##   1  6  2
##   0  3  3
## 
## , , hid = 7
## 
##    out
## exp  1  0
##   1 10  3
##   0  3  1
## 
## , , hid = 8
## 
##    out
## exp  1  0
##   1  3  0
##   0  2  1

epi.2by2(dat = dat, method = "cohort.count", homogeneity = "breslow.day",
outcome= "as.columns")

##              Outcome +    Outcome -      Total        Inc risk *
## Exposed +           91           22        113              80.5
## Exposed -           19           26         45              42.2
## Total              110           48        158              69.6
##                  Odds
## Exposed +       4.136
## Exposed -       0.731
## Total           2.292
## 
## 
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio (crude)                       1.91 (1.34, 2.72)
## Inc risk ratio (M-H)                         1.94 (1.35, 2.78)
## Inc risk ratio (crude:M-H)                   0.98
## Odds ratio (crude)                           5.66 (2.67, 12.02)
## Odds ratio (M-H)                             5.97 (2.72, 13.13)
## Odds ratio (crude:M-H)                       0.95
## Attrib risk (crude) *                        38.31 (22.14, 54.48)
## Attrib risk (M-H) *                          39.21 (21.11, 57.30)
## Attrib risk (crude:M-H)                      0.98
## -------------------------------------------------------------------
##  Test of homogeneity of IRR: X2 test statistic: 15.207 p-value: 0.033
##  Test of homogeneity of  OR: X2 test statistic: 6.452 p-value: 0.488
##  Wald confidence limits
##  M-H: Mantel-Haenszel
##  * Outcomes per 100 population units

## If diagnostic sensitivity of a test is 0.90, and 80% certain that the ##sensitivity is greater than 0.75, what are the shape1 and shape2 parameters for a beta distribution satisfying these constraints?
rval <- epi.betabuster(mode = 0.90, conf = 0.80, greaterthan = TRUE,
x = 0.75, conf.level = 0.95, max.shape1 = 100, step = 0.001)
rval$shape1; rval$shape2

## [1] 9.875

## [1] 1.986111

r <- rval$shape1 - 1; r

## [1] 8.875

## from 10 trials, n:
n <- rval$shape2 + rval$shape1 - 2; n

## [1] 9.861111

dat <- data.frame(x = seq(from = 0, to = 1, by = 0.001),
y = dbeta(x = seq(from = 0, to = 1,by = 0.001),
shape1 = rval$shape1, shape2 = rval$shape2))
## Density plot of the estimated beta distribution
library(ggplot2)
ggplot(data = dat, aes(x = x, y = y)) +
geom_line() +
xlab("Test sensitivity") +
  ylab("Density")

#Bohning's test for overdispersion of Poisson data
data(epi.SClip)
obs <- epi.SClip$cases
pop <- epi.SClip$population
exp <- (sum(obs) / sum(pop)) * pop
epi.bohning(obs, exp, alpha = 0.05)

##   test.statistic p.value
## 1       53.33841       0

#Concordance correlation coefficient
set.seed(seed = 1234)
method1 <- rnorm(n = 100, mean = 0, sd = 1)
method2 <- method1 + runif(n = 100, min = -0.25, max = 0.25)
## Introduce some missing values:
method1[50] <- NA
method2[75] <- NA
tmp <- data.frame(method1, method2)
tmp.ccc <- epi.ccc(method1, method2, ci = "z-transform", conf.level = 0.95,
rep.measure = FALSE)
tmp.lab <- data.frame(lab = paste("CCC: ",
round(tmp.ccc$rho.c[,1], digits = 2), " (95% CI ",
round(tmp.ccc$rho.c[,2], digits = 2), " - ",
round(tmp.ccc$rho.c[,3], digits = 2), ")", sep = ""))
z <- lm(method2 ~ method1)
alpha <- summary(z)$coefficients[1,1]
beta <- summary(z)$coefficients[2,1]
tmp.lm <- data.frame(alpha, beta)

## Concordance correlation plot:
library(ggplot2)
ggplot(tmp, aes(x = method1, y = method2)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
geom_abline(data = tmp.lm, aes(intercept = alpha, slope = beta),
linetype = "dashed") +
xlim(0, 3) +
ylim(0, 3) +
xlab("Method 1") +
ylab("Method 2") +
geom_text(data = tmp.lab, x = 0.5, y = 2.95, label = tmp.lab$lab) +
coord_fixed(ratio = 1 / 1)

## Warning: Removed 67 rows containing missing values (geom_point).

x <- c(494,395,516,434,476,557,413,442,650,433,417,656,267,
478,178,423,427)
y <- c(512,430,520,428,500,600,364,380,658,445,432,626,260,
477,259,350,451)
tmp.ccc <- epi.ccc(x, y, ci = "z-transform", conf.level = 0.95,
rep.measure = FALSE)
tmp <- data.frame(mean = tmp.ccc$blalt[,1], delta = tmp.ccc$blalt[,2])

library(ggplot2)
ggplot(tmp.ccc$blalt, aes(x = mean, y = delta)) +
geom_point() +
geom_hline(data = tmp.ccc$sblalt, aes(yintercept = lower), linetype = 2) +
geom_hline(data = tmp.ccc$sblalt, aes(yintercept = upper), linetype = 2) +
geom_hline(data = tmp.ccc$sblalt, aes(yintercept = est), linetype = 1) +
xlab("Average PEFR by two meters (L/min)") +
ylab("Difference in PEFR (L/min)") +
xlim(0,800) +
ylim(-150,150)

#Sample size, power or minimum detectable odds ratio for an unmatched
#or matched case-control study
## A case-control study of the relationship between smoking and CHD is
## planned. A sample of men with newly diagnosed CHD will be compared for
## smoking status with a sample of controls. Assuming an equal number of
## cases and controls, how many study subject are required to detect an
## odds ratio of 2.0 with 0.90 power using a two-sided 0.05 test? Previous
## surveys have shown that around 0.30 of males without CHD are smokers.
epi.ccsize(OR = 2.0, p0 = 0.30, n = NA, power = 0.90, r = 1, rho = 0,
design = 1, sided.test = 2, conf.level = 0.95, method = "unmatched",
fleiss = FALSE)

## $n.total
## [1] 376
## 
## $n.case
## [1] 188
## 
## $n.control
## [1] 188
## 
## $power
## [1] 0.9
## 
## $OR
## [1] 2

## A total of 376 men need to be sampled: 188 cases and 188 controls

## Suppose we wish to determine the power to detect an odds ratio of 2.0
## using a two-sided 0.05 test when 188 cases and 940 controls
## are available (that is, the ratio of controls to cases is 5:1). Assume
## the prevalence of smoking in males without CHD is 0.30.
n <- 188 + 940
epi.ccsize(OR = 2.0, p0 = 0.30, n = n, power = NA, r = 5, rho = 0,
design = 1, sided.test = 2, conf.level = 0.95, method = "unmatched",
fleiss = TRUE)

## $n.total
## [1] 1128
## 
## $n.case
## [1] 188
## 
## $n.control
## [1] 940
## 
## $power
## [1] 0.9880212
## 
## $OR
## [1] 2

## The power of this study, with the given sample size allocation is 0.99.

## The following statement appeared in a study proposal to identify risk
## factors for campylobacteriosis in humans:
## `We will prospectively recruit 300 culture-confirmed Campylobacter cases
## reported under the Public Health Act. We will then recruit one control per
## case from general practices of the enrolled cases, using frequency matching
## by age and sex. With exposure levels of 10% (thought to be realistic
## given past foodborne disease case control studies) this sample size
## will provide 80% power to detect an odds ratio of 2 at the 5% alpha
## level.'
## Confirm the statement that 300 case subjects will provide 80% power in
## this study.
epi.ccsize(OR = 2.0, p0 = 0.10, n = 600, power = NA, r = 1, rho = 0.01,
design = 1, sided.test = 2, conf.level = 0.95, method = "matched",
fleiss = TRUE)

## $n.total
## [1] 600
## 
## $n.case
## [1] 300
## 
## $n.control
## [1] 300
## 
## $power
## [1] 0.8265727
## 
## $OR
## [1] 2

## If the true odds ratio for Campylobacter in exposed subjects relative to
## unexposed subjects is 2.0 we will be able to reject the null hypothesis
## that this odds ratio equals 1 with probability (power) 0.826. The Type I
# error probability associated with this test of this null hypothesis is 0.05.

## We wish to conduct a case-control study to assess whether bladder cancer
## may be associated with past exposure to cigarette smoking. Cases will be
## patients with bladder cancer and controls will be patients hospitalised
## for injury. It is assumed that 20% of controls will be smokers or past
## smokers, and we wish to detect an odds ratio of 2 with power 90%.
## Three controls will be recruited for every case. How many subjects need
## to be enrolled in the study?
epi.ccsize(OR = 2.0, p0 = 0.20, n = NA, power = 0.90, r = 3, rho = 0,
design = 1, sided.test = 2, conf.level = 0.95, method = "unmatched",
fleiss = FALSE)

## $n.total
## [1] 620
## 
## $n.case
## [1] 155
## 
## $n.control
## [1] 465
## 
## $power
## [1] 0.9
## 
## $OR
## [1] 2

## A total of 620 subjects need to be enrolled in the study: 155 cases and
## 465 controls.

## An alternative is to conduct a matched case-control study rather than the
## unmatched design outlined above. One case will be matched to one control
## and the correlation between case and control exposures for matched pairs
## (rho) is estimated to be 0.01 (low). Using the same assumptions as those
## described above, how many study subjects will be required?
epi.ccsize(OR = 2.0, p0 = 0.20, n = NA, power = 0.90, r = 1, rho = 0.01,
design = 1, sided.test = 2, conf.level = 0.95, method = "matched",
fleiss = FALSE)

## $n.total
## [1] 456
## 
## $n.case
## [1] 228
## 
## $n.control
## [1] 228
## 
## $power
## [1] 0.9
## 
## $OR
## [1] 2

## A total of 456 subjects need to be enrolled in the study: 228 cases and
## 228 controls.

r <- 1
p0 = seq(from = 0.05, to = 0.95, length = 50)
OR <- seq(from = 1.05, to = 6, length = 100)
dat <- expand.grid(p0 = p0, OR = OR)
dat$n.total <- NA
for(i in 1:nrow(dat)){
dat$n.total[i] <- epi.ccsize(OR = dat$OR[i], p0 = dat$p0[i], n = NA,
power = 0.80, r = 1, rho = 0, design = 1, sided.test = 2,
conf.level = 0.95, method = "unmatched", fleiss = FALSE)$n.total
}
grid.n <- matrix(dat$n.total, nrow = length(p0))
breaks <- c(22:30,32,34,36,40,45,50,55,60,70,80,90,100,125,150,175,
200,300,500,1000)
par(mar = c(5,5,0,5), bty = "n")
contour(x = p0, y = OR, z = log10(grid.n), add = FALSE, levels = log10(breaks),
labels = breaks, xlim = c(0,1), ylim = c(1,6), las = 1, method = "flattest",
xlab = 'Proportion of controls exposed', ylab = "Minimum OR to detect")

library(ggplot2); library(directlabels)
p <- ggplot(data = dat, aes(x = p0, y = OR, z = n.total)) +
geom_contour(aes(colour = ..level..), breaks = breaks) +
  xlab("Proportion of controls exposed") +
ylab("Minimum OR to detect") +
xlim(0,1) +
ylim(1,6)
print(direct.label(p, list("far.from.others.borders", "calc.boxes",
"enlarge.box", hjust = 1, vjust = 1, box.color = NA,
fill = "transparent", "draw.rects")))

#A matched case control study is to be
## carried out to quantify the association between exposure A and an outcome B.
## Assume the prevalence of exposure in controls is 0.60 and the
## correlation between case and control exposures for matched pairs (rho) is
## 0.20 (moderate). Assuming an equal number of cases and controls, how many
## subjects need to be enrolled into the study to detect an odds ratio of 3.0
## with 0.80 power using a two-sided 0.05 test?
epi.ccsize(OR = 3.0, p0 = 0.60, n = NA, power = 0.80, r = 1, rho = 0.2,
design = 1, sided.test = 2, conf.level = 0.95, method = "matched",
fleiss = FALSE)

## $n.total
## [1] 162
## 
## $n.case
## [1] 81
## 
## $n.control
## [1] 81
## 
## $power
## [1] 0.8
## 
## $OR
## [1] 3

## A total of 162 subjects need to be enrolled in the study: 81 cases and
## 81 controls. How many cases and controls are required if we select three
## controls per case?
epi.ccsize(OR = 3.0, p0 = 0.60, n = NA, power = 0.80, r = 3, rho = 0.2,
design = 1, sided.test = 2, conf.level = 0.95, method = "matched",
fleiss = FALSE)

## $n.total
## [1] 204
## 
## $n.case
## [1] 51
## 
## $n.control
## [1] 153
## 
## $power
## [1] 0.8
## 
## $OR
## [1] 3

## A total of 204 subjects need to be enrolled in the study: 51 cases and
## 153 controls.

## A survey to estimate the total number of residents over 65 years of
## age that require the services of a nurse is to be carried out. There are
## five housing complexes in the study area and we expect that there might
## be a total of around 34 residents meeting this criteria (variance 6.8).
## We would like the estimated sample size to provide us with an estimate
## that is within 10% of the true value. How many housing complexes (clusters)
## should be sampled?
epi.cluster1size(n = 5, mean = 34, var = 6.8, epsilon.r = 0.10, method =
"total", conf.level = 0.999)

## [1] 3

## We would need to sample 3 housing complexes to meet the specifications
## for this study.

## We intend to conduct a survey of nurse practitioners to estimate the
## average number of patients seen by each nurse. There are five health
## centres in the study area, each with three nurses. We intend to sample
## two nurses from each health centre. We would like to be 95% confident
## that our estimate is within 30% of the true population value. We expect
## that the mean number of patients seen at the health centre level
## is 84 (var 567) and the mean number of patients seen at the nurse
## level is 28 (var 160). How many health centres should be sampled?
tn <- c(5, 3); tmean <- c(84, 28); tsigma2.x <- c(567, 160)
epi.cluster2size(nbar = 2, n = tn, mean = tmean, sigma2.x = tsigma2.x,
sigma2.y = NA, sigma2.xy = NA, epsilon.r = 0.3, method = "mean",
conf.level = 0.95)

## [1] 3

## Three health centres need to be sampled to meet the survey
## specifications.

## Same scenario as above, but this time we want to estimate the proportion
## of patients referred to a general practitioner from each clinic. As before,
## we want to be 95% confident that our estimate of the proportion of referred
## patients is within 30% of the true population value. We expect that
## approximately 36% of patients are referred.
clinic <- rep(1:5, each = 3)
nurse <- 1:15
Xij <- c(58,44,18,42,53,10,13,18,37,16,32,10,25,23,23)
Yij <- c(5,6,6,3,19,2,12,6,30,5,14,4,17,9,14)
ssudat <- data.frame(clinic, nurse, Xij, Yij)
Xbar <- by(data = ssudat$Xij, INDICES = ssudat$clinic, FUN = mean)
ssudat$Xbar <- rep(Xbar, each = 3)
Ybar <- by(data = ssudat$Yij, INDICES = ssudat$clinic, FUN = mean)
ssudat$Ybar <- rep(Ybar, each = 3)
ssudat$Xij.Xbar <- (ssudat$Xij - ssudat$Xbar)^2
ssudat$Yij.Ybar <- (ssudat$Yij - ssudat$Ybar)^2
ssudat$XY <- (ssudat$Xij - ssudat$Xbar) * (ssudat$Yij - ssudat$Ybar)
## Collapse the nurse-level data (created above) to the clinic level.
clinic <- as.vector(by(ssudat$clinic, INDICES = ssudat$clinic, FUN = min))
Xi <- as.vector(by(ssudat$Xij, INDICES = ssudat$clinic, FUN = sum))
Yi <- as.vector(by(ssudat$Yij, INDICES = ssudat$clinic, FUN = sum))
psudat <- data.frame(clinic, Xi, Yi)
psudat$Xi.Xbar <- (psudat$Xi - mean(psudat$Xi))^2
psudat$Yi.Ybar <- (psudat$Yi - mean(psudat$Yi))^2
psudat$XY <- (psudat$Xi - mean(psudat$Xi)) * (psudat$Yi - mean(psudat$Yi))
## Number of primary and secondary sampling units:
npsu <- nrow(psudat)
nssu <- mean(by(ssudat$nurse, INDICES = ssudat$clinic, FUN = length))
tn <- c(npsu, nssu)
## Mean of X at primary sampling unit and secondary sampling unit level:
tmean <- c(mean(psudat$Xi), mean(ssudat$Xij))
## Variance of number of patients seen:
tsigma2.x <- c(mean(psudat$Xi.Xbar), mean(ssudat$Xij.Xbar))
## Variance of number of patients referred:
tsigma2.y <- c(mean(psudat$Yi.Ybar), mean(ssudat$Yij.Ybar))
tsigma2.xy <- c(mean(psudat$XY), mean(ssudat$XY))
epi.cluster2size(nbar = 2, R = 0.36, n = tn, mean = tmean,
sigma2.x = tsigma2.x, sigma2.y = tsigma2.y, sigma2.xy = tsigma2.xy,
epsilon.r = 0.3, method = "proportion", conf.level = 0.95)

## [1] 2

## Two health centres need to be sampled to meet the survey
## specifications.

## We want to determine the prevalence of brucellosis in dairy cattle in a
## country comprised of 20 provinces. The number of dairy herds per province
## ranges from 50 to 1200. Herd size ranges from 25 to 900. We suspect that
## the prevalence of brucellosis-positive herds across the entire country
## is around 10%. We suspect that there are a small number of provinces
## with a relatively high individual cow-level prevalence of disease
## (thought to be between 40% and 80%). How many herds should be sampled
## from each province if we want our estimate of prevalence to be within
## 30% of the true population value?
epi.simplesize(N = 1200, Vsq = NA, Py = 0.10, epsilon.r = 0.30,
method = "proportion", conf.level = 0.95)

## [1] 291

## A total of 234 herds should be sampled from each province.
## Next we work out the number of provinces that need to be sampled.
## Again, we would like to be 95% confident that our estimate is within
## 30% of the true population value. Simulate some data to derive appropriate
## estimates of sigma2.x, sigma2.y and sigma2.xy.
## Number of herds per province:
npsu <- 20
nherds.p <- as.integer(runif(n = npsu, min = 50, max = 1200))
## Mean herd size per province:
hsize.p <- as.integer(runif(n = npsu, min = 25, max = 900))
## Simulate estimates of the cow-level prevalence of brucellosis in each
## province. Here we generate an equal mix of `low' and `high' brucellosis
## prevalence provinces:
prev.p <- c(runif(n = 15, min = 0, max = 0.05),
runif(n = 5, min = 0.40, max = 0.80))
## Generate some data:
prov <- c(); herd <- c();
Xij <- c(); Yij <- c();
Xbar <- c(); Ybar <- c();
Xij.Xbar <- c(); Yij.Ybar <- c()
for(i in 1:npsu){
## Province identifiers:
tprov <- rep(i, times = nherds.p[i])
prov <- c(prov, tprov)
## Herd identifiers:
therd <- 1:nherds.p[i]
herd <- c(herd, therd)
## Number of cows in each of the herds in this province:
tXij <- as.integer(rlnorm(n = nherds.p[i], meanlog = log(hsize.p[i]),
sdlog = 0.5))
tXbar <- mean(tXij)
tXij.Xbar <- (tXij - tXbar)^2
Xij <- c(Xij, tXij)
Xbar <- c(Xbar, rep(tXbar, times = nherds.p[i]))
Xij.Xbar <- c(Xij.Xbar, tXij.Xbar)
## Number of brucellosis-positive cows in each herd:
tYij <- c()
for(j in 1:nherds.p[i]){
ttYij <- rbinom(n = 1, size = tXij[j], prob = prev.p[i])
tYij <- c(tYij, ttYij)
}
tYbar <- mean(tYij)
tYij.Ybar <- (tYij - tYbar)^2
Yij <- c(Yij, tYij)
Ybar <- c(Ybar, rep(tYbar, times = nherds.p[i]))
Yij.Ybar <- c(Yij.Ybar, tYij.Ybar)
}
ssudat <- data.frame(prov, herd, Xij, Yij, Xbar, Ybar, Xij.Xbar, Yij.Ybar)
ssudat$XY <- (ssudat$Xij - ssudat$Xbar) * (ssudat$Yij - ssudat$Ybar)
## Collapse the herd-level data (created above) to the province level:
prov <- as.vector(by(ssudat$prov, INDICES = ssudat$prov, FUN = min))
Xi <- as.vector(by(ssudat$Xij, INDICES = ssudat$prov, FUN = sum))
Yi <- as.vector(by(ssudat$Yij, INDICES = ssudat$prov, FUN = sum))
psudat <- data.frame(prov, Xi, Yi)
psudat$Xi.Xbar <- (psudat$Xi - mean(psudat$Xi))^2
psudat$Yi.Ybar <- (psudat$Yi - mean(psudat$Yi))^2
psudat$XY <- (psudat$Xi - mean(psudat$Xi)) * (psudat$Yi - mean(psudat$Yi))
## Number of primary and secondary sampling units:
npsu <- nrow(psudat)
nssu <- round(mean(by(ssudat$herd, INDICES = ssudat$prov, FUN = length)),
digits = 0)
tn <- c(npsu, nssu)
## Mean of X at primary sampling unit and secondary sampling unit level:
tmean <- c(mean(psudat$Xi), mean(ssudat$Xij))
## Variance of herd size:
tsigma2.x <- c(mean(psudat$Xi.Xbar), mean(ssudat$Xij.Xbar))
## Variance of number of brucellosis-positive cows:
tsigma2.y <- c(mean(psudat$Yi.Ybar), mean(ssudat$Yij.Ybar))
tsigma2.xy <- c(mean(psudat$XY), mean(ssudat$XY))
## Finally, calculate the number of provinces to be sampled:
tR <- sum(psudat$Yi) / sum(psudat$Xi)
epi.cluster2size(nbar = 234, R = tR, n = tn, mean = tmean,
sigma2.x = tsigma2.x, sigma2.y = tsigma2.y, sigma2.xy = tsigma2.xy,
epsilon.r = 0.3, method = "proportion", conf.level = 0.95)

## [1] 2

## Four provinces (sampling 234 herds from each) are required to be 95%
## confident that our estimate of the individual animal prevalence of
## brucellosis is within 30% of the true population value.

## The expected prevalence of disease in a population of cattle is 0.10.
## We wish to conduct a survey, sampling 50 animals per farm. No data
## are available to provide an estimate of rho, though we suspect
## the intra-cluster correlation for this disease to be moderate.
## We wish to be 95% certain of being within 10% of the true population
## prevalence of disease. How many herds should be sampled?
p <- 0.10; b <- 50; D <- 4
rho <- (D - 1) / (b - 1)
epi.clustersize(p = 0.10, b = 50, rho = rho, epsilon.r = 0.10,
conf.level = 0.95)

## $clusters
## [1] 278
## 
## $units
## [1] 13900
## 
## $design
## [1] 4

## We need to sample 278 herds (13900 samples in total).

## A cross-sectional study is to be carried out to determine the prevalence
## of a given disease in a population using a two-stage cluster design. We
## estimate prevalence to be 0.20 and we expect rho to be in the order of 0.02.
## We want to take sufficient samples to be 95% certain that our estimate of
## prevalence is within 5% of the true population value (that is, a relative
## error of 0.05 / 0.20 = 0.25). Assuming 20 responses from each cluster,
## how many clusters do we need to be sample?
epi.clustersize(p = 0.20, b = 20, rho = 0.02, epsilon.r = 0.25,
conf.level = 0.95)

## $clusters
## [1] 18
## 
## $units
## [1] 360
## 
## $design
## [1] 1.38

## We need to sample 18 clusters (360 samples in total).

## A cohort study of smoking and coronary heart disease (CHD) in middle aged men
## is planned. A sample of men will be selected at random from the population
## and those that agree to participate will be asked to complete a
## questionnaire. The follow-up period will be 5 years. The investigators would
## like to be 0.90 sure of being able to detect when the risk ratio of CHD
## is 1.4 for smokers, using a 0.05 significance test. Previous evidence
## suggests that the incidence risk of death rate in non-smokers is 413 per
## 100,000 per year. Assuming equal numbers of smokers and non-smokers are
## sampled, how many men should be sampled overall?
e1 = 1.4 * (5 * 413)/100000; e0 = (5 * 413)/100000
epi.cohortsize(exposed = e1, unexposed = e0, n = NA, power = 0.90,
r = 1, design = 1, sided.test = 1, conf.level = 0.95)

## $n.total
## [1] 12130
## 
## $n.exposed
## [1] 6065
## 
## $n.unexposed
## [1] 6065
## 
## $power
## [1] 0.9
## 
## $lambda
## [1] 1.4

## A total of 12,130 men need to be sampled (6065 smokers and 6065 non-smokers).

## Say, for example, we are only able to enrol 5000 subjects into the study
## described above. What is the minimum and maximum detectable risk ratio?
e0 = (5 * 413)/100000
epi.cohortsize(exposed = NA, unexposed = e0, n = 5000, power = 0.90,
r = 1, design = 1, sided.test = 1, conf.level = 0.95)

## $n.total
## [1] 5000
## 
## $n.exposed
## [1] 2500
## 
## $n.unexposed
## [1] 2500
## 
## $power
## [1] 0.9
## 
## $lambda
## [1] 0.5047104 1.6537812

## The minimum detectable risk ratio >1 is 1.65. The maximum detectable
## risk ratio <1 is 0.50.


## A study is to be carried out to assess the effect of a new treatment for
## anoestrus in dairy cattle. What is the required sample size if we expect
## the proportion of cows responding in the treatment (exposed) group to be
## 0.30 and the proportion of cows responding in the control (unexposed) group
## to be 0.15? The required power for this study is 0.80 using a two-sided
## 0.05 test.
epi.cohortsize(exposed = 0.30, unexposed = 0.15, n = NA, power = 0.80,
r = 1, design = 1, sided.test = 2, conf.level = 0.95)

## $n.total
## [1] 242
## 
## $n.exposed
## [1] 121
## 
## $n.unexposed
## [1] 121
## 
## $power
## [1] 0.8
## 
## $lambda
## [1] 2

## A total of 242 cows are required: 121 in the treatment (exposed) group and
## 121 in the control (unexposed) group.
## Assume now that this study is going to be carried out using animals from a
## number of herds. What is the required sample size when you account for the
## observation that response to treatment is likely to cluster within herds.
## For the exercise, assume that the intra-cluster correlation coefficient
## (the rate of homogeneity, rho) for this treatment is 0.05 and the
## average number of cows sampled per herd will be 30.
## Calculate the design effect, given rho = (design - 1) / (nbar - 1),
## where nbar equals the average number of individuals per cluster:
design <- 0.05 * (30 - 1) + 1
epi.cohortsize(exposed = 0.30, unexposed = 0.15, n = NA, power = 0.80,
r = 1, design = design, sided.test = 2, conf.level = 0.95)

## $n.total
## [1] 592
## 
## $n.exposed
## [1] 296
## 
## $n.unexposed
## [1] 296
## 
## $power
## [1] 0.8
## 
## $lambda
## [1] 2

## A total of 592 cows are required for this study: 296 in the treatment group
## and 296 in the control group.

dat <- rnorm(n = 100, mean = 0, sd = 1)
epi.conf(dat, ctype = "mean.single")

##         est        se       lower     upper
## 1 0.1546891 0.1131938 -0.06991198 0.3792901

group <- c(rep("A", times = 5), rep("B", times = 5))
val = round(c(rnorm(n = 5, mean = 10, sd = 5),
rnorm(n = 5, mean = 7, sd = 5)), digits = 0)
dat <- data.frame(group = group, val = val)
epi.conf(dat, ctype = "mean.unpaired")

##   est       se     lower    upper
## 1   2 4.071855 -7.389714 11.38971

before <- c(148,142,136,134,138,140,132,144,128,170,162,150,138,154,126,116)
after <- c(152,152,134,148,144,136,144,150,146,174,162,162,146,156,132,126)
dat <- data.frame(before, after)
dat <- data.frame(cbind(before, after))
epi.conf(dat, ctype = "mean.paired", conf.level = 0.95)

##     est       se    lower    upper
## 1 6.625 1.491294 3.446382 9.803618

pos <- 81
neg <- (263 - 81)
dat <- as.matrix(cbind(pos, neg))
round(epi.conf(dat, ctype = "prop.single"), digits = 3)

##       est    se lower upper
## pos 0.308 0.028 0.255 0.366

grp1 <- matrix(cbind(5, 51), ncol = 2)
grp2 <- matrix(cbind(0, 29), ncol = 2)
dat <- as.matrix(cbind(grp1, grp2))
round(epi.conf(dat, ctype = "prop.unpaired"), digits = 3)

##     est    se  lower upper
## 1 0.089 0.038 -0.038 0.193

dat <- as.matrix(cbind(14, 5, 0, 22))
round(epi.conf(dat, ctype = "prop.paired", conf.level = 0.95), digits = 3)

##     est    se lower upper
## 1 0.122 0.051 0.011 0.226

pos <- 4; pop <- 200
dat <- as.matrix(cbind(pos, pop))
epi.conf(dat, ctype = "prevalence", method = "exact", N = 1000,
design = 1, conf.level = 0.95) * 100

##     est     lower    upper
## pos   2 0.5475566 5.041361

obs <- c(5, 10, 12, 18); pop <- c(234, 189, 432, 812)
dat <- as.matrix(cbind(obs, pop))
round(epi.conf(dat, ctype = "smr"), digits = 2)

##    est   se lower upper
## 1 0.79 2.42  0.26  1.85
## 2 1.96 2.69  0.94  3.60
## 3 1.03 1.78  0.53  1.80
## 4 0.82 1.30  0.49  1.30

## A survey has been conducted to determine the proportion of broilers
## protected from a given disease following vaccination. We assume that
## the intra-cluster correlation coefficient for protection (also known as the
## rate of homogeneity, rho) is 0.4 and the average number of birds per
## flock is 30. A total of 5898 birds from a total of 10363 were identified
## as protected. What proportion of birds are protected and what is the 95%
## confidence interval for this estimate?
## Calculate the design effect, given rho = (design - 1) / (nbar - 1), where
## nbar equals the average number of individuals sampled per cluster:
D <- 0.4 * (30 - 1) + 1
## The design effect is 12.6. Now calculate the proportion protected:
dat <- as.matrix(cbind(5898, 10363))
epi.conf(dat, ctype = "prevalence", method = "fleiss", N = 1000000,
design = D, conf.level = 0.95)

##         est         se     lower     upper
## 1 0.5691402 0.01717826 0.5354675 0.6028129

os.refs <- c("SJ505585","SJ488573","SJ652636")
epi.convgrid(os.refs)

## x.coord y.coord 
##  365200  363600

set.seed(seed = 1234)
obs <- round(runif(n = 100, min = 0, max = 1), digits = 0)
v1 <- round(runif(n = 100, min = 0, max = 4), digits = 0)
v2 <- round(runif(n = 100, min = 0, max = 4), digits = 0)
dat <- data.frame(obs, v1, v2)
dat.glm <- glm(obs ~ v1 + v2, family = binomial, data = dat)
dat.mf <- model.frame(dat.glm)
## Covariate pattern:
epi.cp(dat.mf[-1])

## $cov.pattern
##    id n v1 v2
## 1   1 3  0  3
## 2   2 9  2  2
## 3   3 5  1  1
## 4   4 6  1  3
## 5   5 7  1  2
## 9   6 7  2  1
## 10  7 1  0  4
## 11  8 9  3  2
## 12  9 4  0  1
## 13 10 5  4  1
## 16 11 2  4  4
## 17 12 5  4  3
## 19 13 1  0  2
## 20 14 6  3  3
## 22 15 2  4  0
## 33 16 3  1  0
## 34 17 8  2  3
## 35 18 3  4  2
## 40 19 1  3  4
## 45 20 2  1  4
## 54 21 4  3  1
## 62 22 1  2  0
## 65 23 2  0  0
## 66 24 3  3  0
## 83 25 1  2  4
## 
## $id
##   [1]  1  1  1  2  2  2  2  2  2  2  2  2  3  3  3  3  3  4  4  4  4  4  4
##  [24]  5  5  5  5  5  5  5  6  6  6  6  6  6  6  7  8  8  8  8  8  8  8  8
##  [47]  8  9  9  9  9 10 10 10 10 10 11 11 12 12 12 12 12 13 14 14 14 14 14
##  [70] 14 15 15 16 16 16 17 17 17 17 17 17 17 17 18 18 18 19 20 20 21 21 21
##  [93] 21 22 23 23 24 24 24 25

infert.glm <- glm(case ~ spontaneous + induced, data = infert,
family = binomial())
infert.mf <- model.frame(infert.glm)
infert.cp <- epi.cp(infert.mf[-1])
infert.obs <- as.vector(by(infert$case, as.factor(infert.cp$id),
FUN = sum))
infert.fit <- as.vector(by(fitted(infert.glm), as.factor(infert.cp$id),
FUN = min))
infert.res <- epi.cpresids(obs = infert.obs, fit = infert.fit,
covpattern = infert.cp)

id <- 1:1000
tmp <- rnorm(1000, mean = 0, sd = 1)
id <- sample(id, size = 20)
tmp[id] <- NA
epi.descriptives(tmp, conf.level = 0.95)

## $arithmetic
##      n        mean       sd        q25         q50       q75       lower
## 1 1000 -0.02758861 0.984321 -0.6632693 -0.01809424 0.6181177 -0.08867026
##        upper       min      max na
## 1 0.03349305 -3.396064 3.195901 20
## 
## $geometric
##     n     mean       sd       q25       q50      q75     lower     upper
## 1 503 0.492215 1.121644 0.2915952 0.6295361 1.076828 0.4462038 0.5429707
##           min      max na
## 1 0.001313548 3.195901 20
## 
## $symmetry
##      skewness  kurtosis
## 1 -0.09707511 0.3805558

## We would like to confirm the absence of disease in a single 1000-cow
## dairy herd. We expect the prevalence of disease in the herd to be 5%.
## We intend to use a single test with a sensitivity of 0.90 and a
## specificity of 0.80. How many samples should we take to be 95% certain
## that, if all tests are negative, the disease is not present?
epi.detectsize(N = 1000, prev = 0.05, se = 0.90, sp = 0.80, interpretation =
"series", covar = c(0,0), conf.level = 0.95, finite.correction = TRUE)

## $performance
##   sens spec
## 1  0.9  0.8
## 
## $sample.size
## [1] 59

## We would like to confirm the absence of disease in a study area. If the
## disease is present we expect the between-herd prevalence to be 8% and the
## within-herd prevalence to be 5%. We intend to use two tests: the first has
## a sensitivity and specificity of 0.90 and 0.80, respectively. The second
## has a sensitivity and specificity of 0.95 and 0.85, respectively. The two
## tests will be interpreted in parallel. How many herds and cows within herds
## should we sample to be 95% certain that the disease is not present in the
## study area if all tests are negative? There area is comprised of
## approximately 5000 herds and the average number of cows per herd is 100.
epi.detectsize(N = c(5000, 100), prev = c(0.08, 0.05), se = c(0.90, 0.95),
sp = c(0.80, 0.85), interpretation = "parallel", covar = c(0,0),
conf.level = 0.95, finite.correction = TRUE)

## $performance
##    sens spec
## 1 0.995 0.68
## 
## $sample.size
##   clusters units total
## 1       38    31  1178

## You want to document the absence of Mycoplasma from a 200-sow pig herd.
## Based on your experience and the literature, a minimum of 20% of sows
## would have seroconverted if Mycoplasma were present in the herd. How many
## sows do you need to sample?
epi.detectsize(N = 200, prev = 0.20, se = 1.00, sp = 1.00, conf.level = 0.95,
finite.correction = TRUE)

## $performance
##   sens spec
## 1    1    1
## 
## $sample.size
## [1] 12

## Suppose we are expecting the lower 5% and upper 95% confidence interval
## of relative risk in a data set to be 0.5 and 3.0, respectively.
## A prior guess at the precision of the heterogeneity term would be:
tau <- epi.dgamma(rr = c(0.5, 3.0), quantiles = c(0.05, 0.95))
tau

## [1] 3.370972

## This can be translated into a gamma distribution. We set the mean of the
## distribution as tau and specify a large variance (that is, we are not
## certain about tau).
mean <- tau
var <- 1000
shape <- mean^2 / var
inv.scale <- mean / var
## In WinBUGS the precision of the heterogeneity term may be parameterised
## as tau ~ dgamma(shape, inv.scale). Plot the probability density function
## of tau:
z <- seq(0.01, 10, by = 0.01)
fz <- dgamma(z, shape = shape, scale = 1/inv.scale)
plot(z, fz, type = "l", ylab = "Probability density of tau")

## A study was conducted to estimate the seroprevalence of leptospirosis
## in dogs in Glasgow and Edinburgh, Scotland. For the matrix titled pop
## the numbers represent dog-years at risk. The following data were
## obtained for male and female dogs:
dat <- data.frame(obs = c(15,46,53,16), tar = c(48,212,180,71),
sex = c("M","F","M","F"), city = c("ED","ED","GL","GL"))
obs <- matrix(dat$obs, nrow = 2, byrow = TRUE,
dimnames = list(c("ED","GL"), c("M","F")))
tar <- matrix(dat$tar, nrow = 2, byrow = TRUE,
dimnames = list(c("ED","GL"), c("M","F")))
std <- matrix(data = c(250,250), nrow = 1, byrow = TRUE,
dimnames = list("", c("M","F")))
## Compute directly adjusted seroprevalence estimates, using a standard
## population with equal numbers of male and female dogs:
std <- matrix(data = c(250,250), nrow = 1, byrow = TRUE,
dimnames = list("", c("M","F")))
epi.directadj(obs, tar, std, units = 1, conf.level = 0.95)

## $crude
##   strata cov       est     lower     upper
## 1     ED   M 0.3125000 0.1749039 0.5154212
## 2     GL   M 0.2944444 0.2205591 0.3851406
## 3     ED   F 0.2169811 0.1588575 0.2894224
## 4     GL   F 0.2253521 0.1288082 0.3659577
## 
## $crude.strata
##   strata       est     lower     upper
## 1     ED 0.2346154 0.1794622 0.3013733
## 2     GL 0.2749004 0.2138889 0.3479040
## 
## $adj.strata
##   strata       est     lower     upper
## 1     ED 0.2647406 0.1866047 0.3692766
## 2     GL 0.2598983 0.1964162 0.3406224

## The confounding effect of sex has been removed by the gender-adjusted
## incidence rate estimates.
## The adjusted incidence rate of leptospirosis in Glasgow dogs is 26 (95%
## CI 20 to 34) cases per 100 dog-years at risk.

dat.glm01 <- glm(obs ~ city, offset = log(tar), family = poisson, data = dat)
summary(dat.glm01)

## 
## Call:
## glm(formula = obs ~ city, family = poisson, data = dat, offset = log(tar))
## 
## Deviance Residuals: 
##       1        2        3        4  
##  1.0596  -0.5369   0.4944  -0.8222  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4498     0.1280 -11.323   <2e-16 ***
## cityGL        0.1585     0.1757   0.902    0.367    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3.1463  on 3  degrees of freedom
## Residual deviance: 2.3315  on 2  degrees of freedom
## AIC: 26.991
## 
## Number of Fisher Scoring iterations: 4

dat.pred01 <- predict(object = dat.glm01, newdata =
data.frame(city = c("ED","GL"), tar = c(1,1)),
type = "link", se = TRUE)
conf.level <- 0.95
critval <- qnorm(p = 1 - ((1 - conf.level) / 2), mean = 0, sd = 1)
est <- dat.glm01$family$linkinv(dat.pred01$fit)
lower <- dat.glm01$family$linkinv(dat.pred01$fit -
(critval * dat.pred01$se.fit))
upper <- dat.glm01$family$linkinv(dat.pred01$fit +
(critval * dat.pred01$se.fit))
round(data.frame(est, lower, upper), 3)

##     est lower upper
## 1 0.235 0.183 0.302
## 2 0.275 0.217 0.348

dat.glm02 <- dat.glm02 <- glm(obs ~ city + sex, offset = log(tar),
family = poisson, data = dat)
dat.pred02 <- predict(object = dat.glm02, newdata =
data.frame(sex = c("F","F"), city = c("ED","GL"), tar = c(1,1)),
type = "link", se.fit = TRUE)
conf.level <- 0.95
critval <- qnorm(p = 1 - ((1 - conf.level) / 2), mean = 0, sd = 1)
est <- dat.glm02$family$linkinv(dat.pred02$fit)
lower <- dat.glm02$family$linkinv(dat.pred02$fit -
(critval * dat.pred02$se.fit))
upper <- dat.glm02$family$linkinv(dat.pred02$fit +
(critval * dat.pred02$se.fit))
round(data.frame(est, lower, upper), 3)

##     est lower upper
## 1 0.220 0.168 0.287
## 2 0.217 0.146 0.323

dat$pop <- dat$tar
dat.glm03 <- glm(cbind(obs, pop - obs) ~ city + sex,
family = "binomial", data = dat)
dat.pred03 <- predict(object = dat.glm03, newdata =
data.frame(sex = c("F","F"), city = c("ED","GL")),
type = "link", se.fit = TRUE)
conf.level <- 0.95
critval <- qnorm(p = 1 - ((1 - conf.level) / 2), mean = 0, sd = 1)
est <- dat.glm03$family$linkinv(dat.pred03$fit)
lower <- dat.glm03$family$linkinv(dat.pred03$fit -
(critval * dat.pred03$se.fit))
upper <- dat.glm03$family$linkinv(dat.pred03$fit +
(critval * dat.pred03$se.fit))
round(data.frame(est, lower, upper), 3)

##     est lower upper
## 1 0.220 0.172 0.276
## 2 0.217 0.150 0.304

#Mixed-effects meta-analysis of binary outcomes using the DerSimonian
#and Laird method
data(epi.epidural)
epi.dsl(ev.trt = epi.epidural$ev.trt, n.trt = epi.epidural$n.trt,
ev.ctrl = epi.epidural$ev.ctrl, n.ctrl = epi.epidural$n.ctrl,
names = as.character(epi.epidural$trial), method = "odds.ratio",
alternative = "two.sided", conf.level = 0.95)

## $OR
##         est     lower     upper
## 1 0.7847395 0.4824157 1.2765255
## 2 0.8104075 0.6523177 1.0068106
## 3 0.8670020 0.5729993 1.3118560
## 4 0.4983389 0.2573144 0.9651292
## 5 0.3365571 0.1988637 0.5695893
## 6 0.2602183 0.1840147 0.3679792
## 
## $OR.summary
##         est     lower     upper
## 1 0.5414421 0.3405653 0.8608026
## 
## $weights
##    inv.var      dsl
## 1 16.22741 2.903523
## 2 81.57450 3.389328
## 3 22.39579 3.054029
## 4  8.79260 2.521960
## 5 13.87652 2.818098
## 6 31.99457 3.184304
## 
## $heterogeneity
##            Q           df      p.value 
## 4.035785e+01 5.000000e+00 1.264680e-07 
## 
## $Hsq
##        est    lower    upper
## 1 8.071571 6.200311 10.50758
## 
## $Isq
##        est    lower    upper
## 1 87.61084 83.87178 90.48306
## 
## $tau.sq
## [1] 0.282785
## 
## $effect
##            z      p.value 
## -2.593615115  0.009497274

set.seed(123)
dat <- rpois(n = 50, lambda = 2)
edr.04 <- epi.edr(dat, n = 4, conf.level = 0.95, nsim = 99, na.zero = TRUE)
sdate <- as.Date(x = "31/12/2015", format = "%d/%m/%Y")
dat.04 <- data.frame(idate = sdate + 1:50, est = edr.04$est,
low = edr.04$lower, upp = edr.04$upper)
## Line plot of EDR (and its 95% confidence interval) as a function of
## calendar time:
library(ggplot2)
ggplot(dat.04, aes(x = as.integer(idate), y = est)) +
geom_line() +
geom_line(dat = dat.04, aes(x = as.integer(idate), y = upp),
lty = 3, size = 0.5) +
geom_line(dat = dat.04, aes(x = as.integer(idate), y = low),
lty = 3, size = 0.5) +
scale_x_continuous(name = "Date",
breaks = seq(from = min(as.integer(dat.04$idate)),
to = max(as.integer(dat.04$idate)), by = 7),
labels = seq(from = min(dat.04$idate),
to = max(dat.04$idate), by = 7),
limits = c(min(as.integer(dat.04$idate)),
max(as.integer(dat.04$idate)))) +
scale_y_continuous(name = "Estimated disemination ratio (EDR)",
limits = c(0,10)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 10)) +
geom_hline(yintercept = 1, lty = 2)

#empirical Bayes estimates
data(epi.SClip)
obs <- epi.SClip$cases; pop <- epi.SClip$population
est <- epi.empbayes(obs, pop)
crude.p <- ((obs) / (pop)) * 100000
crude.r <- rank(crude.p)
ebay.p <- ((obs + est[4]) / (pop + est[3])) * 100000
dat <- data.frame(rank = c(crude.r, crude.r),
Method = c(rep("Crude", times = length(crude.r)),
rep("Empirical Bayes", times = length(crude.r))),
est = c(crude.p, ebay.p))

library(ggplot2)
ggplot(dat, aes(x = rank, y = est, colour = Method)) +
geom_point() +
ylab("Lip cancer incidence rates (cases per 100,000 person years)") +
scale_x_continuous(name = "District rank",
breaks = seq(from = 0, to = 60, by = 10),
labels = seq(from = 0, to = 60, by = 10),
limits = c(0,60)) +
ylim(0,30)

## Warning: Removed 1 rows containing missing values (geom_point).

## Bennett, Dismukes, Duma et al. (1979) designed a clinical trial to test
## whether combination chemotherapy for a shorter period would be at least
## as good as conventional therapy for patients with cryptococcal meningitis.
## They recruited 39 patients to each treatment arm and wished to conclude
## that a difference of less than 20% in response rate between the treatments
## would indicate equivalence. Assuming a one-sided test size of 10%, a
## power of 80% and an overall response rate of 50%, what would be a
## realistic sample size if the trial were to be repeated?
epi.equivb(treat = 0.50, control = 0.50, delta = 0.20, n = NA, r = 1,
power = 0.80, alpha = 0.10)

## $n.treat
## [1] 83
## 
## $n.control
## [1] 83
## 
## $n.total
## [1] 166
## 
## $power
## [1] 0.8

## It is anticipated that patients on a particular drug have a mean diastolic
## blood pressure of 96 mmHg, as against 94 mmHg on an alternative. It is also
## anticipated that the standard deviation of diastolic BP is approximately
## 8 mmHg. If one wishes to confirm that the difference is likely to be less
## than 5 mmHg, that is, one wishes to show equivalence, how many patients
## are need to be enrolled in the trial? Assume 80% power and
## 95% significance.
epi.equivc(treat = 94, control = 96, sd = 8, delta = 5, n = NA,
r = 1, power = 0.80, alpha = 0.05)

## $n.treat
## [1] 122
## 
## $n.control
## [1] 122
## 
## $n.total
## [1] 244

## A pharmaceutical company is interested in conducting a clinical trial
## to compare two cholesterol lowering agents for treatment of patients with
## congestive heart disease using a parallel design. The primary efficacy
## parameter is the LDL. In what follows, we will consider the situation
## where the intended trial is for testing equivalence of mean responses
## in LDL. Assume that 80% power is required at a 5% level of significance.
## In this example, we assume a 5 unit (i.e. delta = 5) change of LDL is
## considered of clinically meaningful difference. Assume the standard
## of LDL is 10 units and the LDL concentration in the treatment group is 20
## units and the LDL concentration in the control group is 21 units.
epi.equivc(treat = 20, control = 21, sd = 10, delta = 5, n = NA,
r = 1, power = 0.80, alpha = 0.05)

## $n.treat
## [1] 108
## 
## $n.control
## [1] 108
## 
## $n.total
## [1] 216

## Suppose only 150 subjects were enrolled in the trial, 75 in the treatment
## group and 75 in the control group. What is the estimated study power?
epi.equivc(treat = 0.20, control = 0.21, sd = 0.10, delta = 0.05, n = 150,
r = 1, power = NA, alpha = 0.05)

## $n.treat
## [1] 75
## 
## $n.control
## [1] 75
## 
## $n.total
## [1] 150
## 
## $power
## [1] 0.5790126

obs <- matrix(data = c(58, 130), nrow = 2, byrow = TRUE,
dimnames = list(c("A", "B"), ""))
pop <- matrix(data = c(1000, 2000), nrow = 2, byrow = TRUE,
dimnames = list(c("A", "B"), ""))
std <- 0.060
epi.indirectadj(obs = obs, pop = pop, std = std, units = 100,
conf.level = 0.95)

## $crude.strata
##   est    lower    upper
## A 5.8 4.404183 7.497845
## B 6.5 5.430733 7.718222
## 
## $adj.strata
##   est lower upper
## A 5.8   4.4  7.30
## B 6.5   5.4  7.65
## 
## $smr.strata
##   obs exp       est     lower    upper
## A  58  60 0.9666667 0.7333333 1.216667
## B 130 120 1.0833333 0.9000000 1.275000

obs <- matrix(data = c(58, 130), nrow = 2, byrow = TRUE,
dimnames = list(c("A", "B"), ""))
pop <- matrix(data = c(550, 450, 500, 1500), nrow = 2, byrow = TRUE,
dimnames = list(c("A", "B"), c("Beef", "Dairy")))
std <- matrix(data = c(0.025, 0.085, 0.060), nrow = 1, byrow = TRUE,
dimnames = list("", c("Beef", "Dairy", "Total")))
epi.indirectadj(obs = obs, pop = pop, std = std, units = 100,
conf.level = 0.95)

## $crude.strata
##   est    lower    upper
## A 5.8 4.404183 7.497845
## B 6.5 5.430733 7.718222
## 
## $adj.strata
##        est    lower    upper
## A 6.692308 5.076923 8.423077
## B 5.571429 4.628571 6.557143
## 
## $smr.strata
##   obs exp       est     lower    upper
## A  58  52 1.1153846 0.8461538 1.403846
## B 130 140 0.9285714 0.7714286 1.092857

require(survival)
ovarian.km <- survfit(Surv(futime,fustat) ~ 1, data = ovarian)
ovarian.haz <- epi.insthaz(ovarian.km, conf.level = 0.95)
plot(ovarian.haz$time, ovarian.haz$est, xlab = "Days",
ylab = "Instantaneous hazard", type = "b", pch = 16)

can <- c(rep(1, times = 231), rep(0, times = 178), rep(1, times = 11),
rep(0, times = 38))
smk <- c(rep(1, times = 225), rep(0, times = 6), rep(1, times = 166),
rep(0, times = 12), rep(1, times = 8), rep(0, times = 3), rep(1, times = 18),
rep(0, times = 20))
alc <- c(rep(1, times = 409), rep(0, times = 49))
dat <- data.frame(alc, smk, can)

dat.glm01 <- glm(can ~ alc + smk + alc:smk, family = binomial, data = dat)
summary(dat.glm01)

## 
## Call:
## glm(formula = can ~ alc + smk + alc:smk, family = binomial, data = dat)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.309  -1.309   1.051   1.051   2.018  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -1.89712    0.61914  -3.064  0.00218 **
## alc          1.20397    0.79582   1.513  0.13031   
## smk          1.08619    0.75092   1.446  0.14805   
## alc:smk     -0.08893    0.90794  -0.098  0.92197   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 633.45  on 457  degrees of freedom
## Residual deviance: 605.93  on 454  degrees of freedom
## AIC: 613.93
## 
## Number of Fisher Scoring iterations: 4

dat$d <- rep(NA, times = nrow(dat))
dat$d[dat$alc == 0 & dat$smk == 0] <- 0
dat$d[dat$alc == 1 & dat$smk == 0] <- 1
dat$d[dat$alc == 0 & dat$smk == 1] <- 2
dat$d[dat$alc == 1 & dat$smk == 1] <- 3
dat$d <- factor(dat$d)

dat.glm02 <- glm(can ~ d, family = binomial, data = dat)
summary(dat.glm02)

## 
## Call:
## glm(formula = can ~ d, family = binomial, data = dat)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.309  -1.309   1.051   1.051   2.018  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.8971     0.6191  -3.064 0.002183 ** 
## d1            1.2040     0.7958   1.513 0.130313    
## d2            1.0862     0.7509   1.446 0.148045    
## d3            2.2012     0.6275   3.508 0.000452 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 633.45  on 457  degrees of freedom
## Residual deviance: 605.93  on 454  degrees of freedom
## AIC: 613.93
## 
## Number of Fisher Scoring iterations: 4

epi.interaction(model = dat.glm02, coeff = c(2,3,4), type = "RERI",
conf.level = 0.95)

##        est     lower    upper
## 1 3.739848 -1.836675 9.316372

epi.interaction(model = dat.glm02, coeff = c(2,3,4), type = "APAB",
conf.level = 0.95)

##         est       lower     upper
## 1 0.4138765 -0.07306308 0.9008162

epi.interaction(model = dat.glm02, coeff = c(2,3,4), type = "S",
conf.level = 0.95)

##        est     lower    upper
## 1 1.870482 0.6460433 5.415585

data(epi.epidural)
epi.iv(ev.trt = epi.epidural$ev.trt, n.trt = epi.epidural$n.trt,
ev.ctrl = epi.epidural$ev.ctrl, n.ctrl = epi.epidural$n.ctrl,
names = as.character(epi.epidural$trial), method = "odds.ratio",
alternative = "two.sided", conf.level = 0.95)

## $OR
##         est     lower     upper
## 1 0.7847395 0.4824157 1.2765255
## 2 0.8104075 0.6523177 1.0068106
## 3 0.8670020 0.5729993 1.3118560
## 4 0.4983389 0.2573144 0.9651292
## 5 0.3365571 0.1988637 0.5695893
## 6 0.2602183 0.1840147 0.3679792
## 
## $OR.summary
##        est     lower     upper
## 1 0.602538 0.5195342 0.6988029
## 
## $weights
##        raw  inv.var
## 1 16.22741 16.22741
## 2 81.57450 81.57450
## 3 22.39579 22.39579
## 4  8.79260  8.79260
## 5 13.87652 13.87652
## 6 31.99457 31.99457
## 
## $heterogeneity
##            Q           df      p.value 
## 3.884314e+01 5.000000e+00 2.553888e-07 
## 
## $Hsq
##        est    lower    upper
## 1 7.768627 5.961781 10.12308
## 
## $Isq
##        est    lower    upper
## 1 87.12771 83.22649 90.12158
## 
## $effect
##             z       p.value 
## -6.699094e+00  2.097162e-11

dat <- as.table(matrix(c(19,10,6,256), nrow = 2, byrow = TRUE))
colnames(dat) <- c("L1-pos","L1-neg")
rownames(dat) <- c("L2-pos","L2-neg")
epi.kappa(dat, method = "fleiss", alternative = "greater", conf.level = 0.95)

## $prop.agree
##         obs       exp
## 1 0.9450172 0.8315561
## 
## $pindex
##         est         se      lower      upper
## 1 -0.814433 0.02394423 -0.8613628 -0.7675032
## 
## $bindex
##         est        se       lower      upper
## 1 0.0137457 0.0240457 -0.03338301 0.06087442
## 
## $pabak
##         est     lower     upper
## 1 0.8900344 0.8244906 0.9364991
## 
## $kappa
##         est         se     lower     upper
## 1 0.6735838 0.05842553 0.5590719 0.7880958
## 
## $z
##   test.statistic      p.value
## 1       11.52893 4.715392e-31
## 
## $mcnemar
##   test.statistic df   p.value
## 1              1  1 0.3173105

dat <- as.table(matrix(c(596,61,29,987), nrow = 2, byrow = TRUE))
colnames(dat) <- c("US-pos","US-neg")
rownames(dat) <- c("ELISA-pos","ELISA-neg")
epi.kappa(dat, method = "watson", alternative = "greater", conf.level = 0.95)

## $prop.agree
##         obs       exp
## 1 0.9462044 0.5271277
## 
## $pindex
##          est         se      lower      upper
## 1 -0.2337119 0.01678318 -0.2666063 -0.2008175
## 
## $bindex
##          est         se      lower      upper
## 1 0.01912732 0.01680567 -0.0138112 0.05206583
## 
## $pabak
##         est     lower     upper
## 1 0.8924088 0.8685749 0.9130464
## 
## $kappa
##         est         se     lower    upper
## 1 0.8862366 0.01166471 0.8633742 0.909099
## 
## $z
##   test.statistic p.value
## 1       75.97588       0
## 
## $mcnemar
##   test.statistic df      p.value
## 1       11.37778  1 0.0007432799

epi.meansize(treat = 5, control = 4.5, n = NA, sigma = 1.4, power = 0.95,
r = 1, design = 1, sided.test = 1, conf.level = 0.95)

## $n.total
## [1] 340
## 
## $n.treat
## [1] 170
## 
## $n.control
## [1] 170
## 
## $power
## [1] 0.95
## 
## $delta
## [1] 0.5

epi.meansize(treat = 8, control = 4, n = NA, sigma = 4, power = 0.90,
r = 4, design = 1, sided.test = 2, conf.level = 0.95)

## $n.total
## [1] 70
## 
## $n.treat
## [1] 56
## 
## $n.control
## [1] 14
## 
## $power
## [1] 0.9
## 
## $delta
## [1] 4

data(epi.epidural)
epi.mh(ev.trt = epi.epidural$ev.trt, n.trt = epi.epidural$n.trt,
ev.ctrl = epi.epidural$ev.ctrl, n.ctrl = epi.epidural$n.ctrl,
names = as.character(epi.epidural$trial), method = "odds.ratio",
alternative = "two.sided", conf.level = 0.95)

## $OR
##         est     lower     upper
## 1 0.7847395 0.4824157 1.2765255
## 2 0.8104075 0.6523177 1.0068106
## 3 0.8670020 0.5729993 1.3118560
## 4 0.4983389 0.2573144 0.9651292
## 5 0.3365571 0.1988637 0.5695893
## 6 0.2602183 0.1840147 0.3679792
## 
## $OR.summary
##         est     lower     upper
## 1 0.5949847 0.4220265 0.6594912
## 
## $weights
##        raw  inv.var
## 1 18.31818 16.22741
## 2 90.62500 81.57450
## 3 24.06780 22.39579
## 4 12.45517  8.79260
## 5 25.09709 13.87652
## 6 65.28729 31.99457
## 
## $heterogeneity
##            Q           df      p.value 
## 4.193079e+01 5.000000e+00 6.083437e-08 
## 
## $Hsq
##        est    lower    upper
## 1 8.386159 6.448683 10.90574
## 
## $Isq
##        est    lower    upper
## 1 88.07559 84.49296 90.83052
## 
## $effect
##             z       p.value 
## -5.615455e+00  1.960456e-08

epi.nomogram(se = 0.89, sp = 0.85, lr = NA, pre.pos = 0.05, verbose = FALSE)

## Given a positive test result, the post-test probability of being disease positive is 0.24 
## Given a negative test result, the post-test probability of being disease negative is 0.0068

pre.pos <- 3 / (3 * 52)
epi.nomogram(se = NA, sp = NA, lr = c(9000, 0.1), pre.pos = pre.pos,
verbose = FALSE)

## Given a positive test result, the post-test probability of being disease positive is 0.99 
## Given a negative test result, the post-test probability of being disease negative is 0.002

epi.noninfb(treat = 0.85, control = 0.65, delta = 0.10, n = NA, r = 1,
power = 0.80, alpha = 0.025)

## $n.treat
## [1] 279
## 
## $n.control
## [1] 279
## 
## $n.total
## [1] 558

epi.noninfb(treat = 0.85, control = 0.65, delta = 0.10, n = 400, r = 1,
power = NA, alpha = 0.025)

## $n.treat
## [1] 200
## 
## $n.control
## [1] 200
## 
## $n.total
## [1] 400
## 
## $power
## [1] 0.6604236

epi.noninfc(treat = 0.20, control = 0.20, sd = 0.10, delta = 0.05, n = NA,
r = 1, power = 0.80, alpha = 0.05)

## $n.treat
## [1] 50
## 
## $n.control
## [1] 50
## 
## $n.total
## [1] 100

set.seed(1234)
p <- runif(10, 0, 1)
x <- replicate(n = 5, expr = rbinom(10, 4, p) + 1)
rval <- epi.occc(dat = x, pairs = TRUE)
print(rval); summary(rval)

## 
## Overall CCC           0.6306
## Overall precision     0.6563
## Overall accuracy      0.9609

##        occc     oprec     oaccu
## 1 0.6305961 0.6562845 0.9608579

epi.pooled(se = 0.647, sp = 0.981, P = 0.12, m = 5 , r = 6)

## $HAPneg
## [1] 0.4375677
## 
## $HSe
## [1] 0.9272036
## 
## $HSp
## [1] 0.5624323

epi.popsize(T1 = 400, T2 = 400, T12 = 40, conf.level = 0.95, verbose = FALSE)

## 
##  Estimated population size: 4000 (95% CI 3125 - 5557)
##  Estimated number of untested subjects: 3240 (95% CI 2365 - 4797)

## Create a matrix of simulation results:
x1 <- data.frame(rnorm(n = 10, mean = 120, sd = 10))
x2 <- data.frame(rnorm(n = 10, mean = 80, sd = 5))
x3 <- data.frame(rnorm(n = 10, mean = 40, sd = 20))
y <- 2 + (0.5 * x1) + (0.7 * x2) + (0.2 * x3)
dat <- data.frame(cbind(X1 = x1, X2 = x2, X3 = x3, Y = y))
epi.prcc(dat, sided.test = 2)

##       gamma test.statistic df    p.value
## 1 0.4557694       1.448280  8 0.18557047
## 2 0.6290188       2.288599  8 0.05137712
## 3 0.7313772       3.033326  8 0.01622712

epi.prev(pos = 23, tested = 150, se = 0.96, sp = 0.89, method = "blaker",
units = 100, conf.level = 0.95)

## $ap
##        est   lower    upper
## 1 15.33333 10.0952 21.84397
## 
## $tp
##        est lower    upper
## 1 5.098039     0 12.75762

epi.prev(pos = 6, tested = 151, se = 0.964, sp = 0.927, method = "c-p",
         units = 100, conf.level = 0.95)

## $ap
##       est    lower    upper
## 1 3.97351 1.471945 8.447802
## 
## $tp
##   est lower    upper
## 1   0     0 1.288218

rval <- epi.prev(pos = c(8,12,7), tested = c(210,189,124),
se = c(0.60,0.65,0.70), sp = c(0.90,0.95,0.99), method = "blaker",
units = 100, conf.level = 0.95)
round(rval$tp, digits = 3)

##     est lower upper
## 1 0.000 0.000 2.051
## 2 2.249 0.000 9.447
## 3 6.732 0.998 8.997

epi.propsize(treat = 0.30, control = 0.32, n = NA, power = 0.90,
r = 1, design = 1, sided.test = 1, conf.level = 0.95)

## $n.total
## [1] 18316
## 
## $n.treat
## [1] 9158
## 
## $n.control
## [1] 9158
## 
## $power
## [1] 0.9
## 
## $lambda
## [1] 0.9375

epi.propsize(treat = NA, control = 0.32, n = 10000, power = 0.90,
r = 1, design = 1, sided.test = 1, conf.level = 0.95)

## $n.total
## [1] 10000
## 
## $n.treat
## [1] 5000
## 
## $n.control
## [1] 5000
## 
## $power
## [1] 0.9
## 
## $lambda
## [1] 0.9156443 1.0862238

pmean <- 1500 * 0.05; pvar <- (300 * 0.05)^2
epi.simplesize(N = 20, Vsq = (pvar / pmean^2), Py = NA, epsilon.r = 0.20,
method = "total", conf.level = 0.95)

## [1] 3

epi.simplesize(N = 278, Vsq = 30^2 / 200^2, Py = NA, epsilon.r = 10/200,
method = "mean", conf.level = 0.95)

## [1] 31

n.crude <- epi.simplesize(N = 1E+06, Vsq = NA, Py = 0.15, epsilon.r = 0.20,
method = "proportion", conf.level = 0.95)
n.crude

## [1] 544

rho <- 0.09; nbar <- 20
D <- rho * (nbar - 1) + 1
n.adj <- ceiling(n.crude * D)
n.adj

## [1] 1475

## A systematic review comparing assertive community treatment (ACT) for the
## severely mentally ill was compared to standard care. A systematic review
## comparing ACT to standard care found three trials that assessed mental
## status after 12 months. All three trials used a different scoring system,
## so standardisation is required before they can be compared.
names <- c("Audini", "Morse", "Lehman")
mean.trt <- c(41.4, 0.95, -4.10)
mean.ctrl <- c(42.3, 0.89, -3.80)
sd.trt <- c(14, 0.76, 0.83)
sd.ctrl <- c(12.4, 0.65, 0.87)
n.trt <- c(30, 37, 67)
n.ctrl <- c(28, 35, 58)
epi.smd(mean.trt, sd.trt, n.trt, mean.ctrl, sd.ctrl, n.ctrl,
names, method = "cohens", conf.level = 0.95)

## $md
##           est      lower        upper
## 1 -0.06791065 -0.5830822 0.4472609432
## 2  0.08466121 -0.3776978 0.5470202150
## 3 -0.35345223 -0.7077376 0.0008331048
## 
## $md.invar
##          est      lower      upper
## 1 -0.1630326 -0.4098699 0.08380461
## 
## $md.dsl
##         est      lower     upper
## 1 -0.154154 -0.4243178 0.1160098
## 
## $heterogeneity
##         Q        df   p.value 
## 2.3431570 2.0000000 0.3098774

strata.n <- c(600, 500, 400)
strata.mean <- c(0.164, 0.166, 0.236)
strata.var <- c(0.245, 0.296, 0.436)
epi.stratasize(strata.n, strata.mean, strata.var, strata.Py,
epsilon.r = 0.20, method = "mean", conf.level = 0.95)

## $strata.sample
## [1] 223 186 149
## 
## $total.sample
## [1] 558
## 
## $stats
##           mean     sigma.bx  sigma.wx   sigma.x  rel.var       gamma
## [1,] 0.1838667 0.0009890489 0.3129333 0.3139224 9.285735 0.003160574

strata.n <- c(1500, 2500, 4000)
strata.Py <- c(0.50, 0.60, 0.70)
epi.stratasize(strata.n, strata.mean, strata.var, strata.Py,
epsilon.r = 0.20, method = "proportion", conf.level = 0.95)

## $strata.sample
## [1] 10 17 27
## 
## $total.sample
## [1] 54
## 
## $stats
##         mean sigma.bx sigma.bxd       phi
## [1,] 0.63125 36386460  4554.848 0.1969654
## [2,] 0.63125 36386460  4554.848 0.3216432
## [3,] 0.63125 36386460  4554.848 0.4813914

epi.supb(treat = 0.85, control = 0.65, delta = 0.05, n = NA,
r = 1, power = 0.80, alpha = 0.05)

## $n.treat
## [1] 98
## 
## $n.control
## [1] 98
## 
## $n.total
## [1] 196

epi.supc(treat = 20, control = 20, sd = 10, delta = 5, n = NA,
r = 1, power = 0.80, alpha = 0.05)

## $n.treat
## [1] 50
## 
## $n.control
## [1] 50
## 
## $n.total
## [1] 100

epi.survivalsize(treat = 0.45, control = 0.30, n = NA, power = 0.90,
r = 1, design = 1, sided.test = 2, conf.level = 0.95)

## $n.crude
## [1] 250
## 
## $n.total
## [1] 250
## 
## $n.treat
## [1] 125
## 
## $n.control
## [1] 125

epi.survivalsize(treat = NA, control = NA, n = 500, power = 0.90,
r = 1, design = 1, sided.test = 2, conf.level = 0.95)

## $hazard
## [1] 0.748316 1.336334

dat <- as.table(matrix(c(670,202,74,640), nrow = 2, byrow = TRUE))
colnames(dat) <- c("Dis+","Dis-")
rownames(dat) <- c("Test+","Test-")
rval <- epi.tests(dat, conf.level = 0.95)
print(rval); summary(rval)

##           Outcome +    Outcome -      Total
## Test +          670          202        872
## Test -           74          640        714
## Total           744          842       1586
## 
## Point estimates and 95 % CIs:
## ---------------------------------------------------------
## Apparent prevalence                    0.55 (0.52, 0.57)
## True prevalence                        0.47 (0.44, 0.49)
## Sensitivity                            0.90 (0.88, 0.92)
## Specificity                            0.76 (0.73, 0.79)
## Positive predictive value              0.77 (0.74, 0.80)
## Negative predictive value              0.90 (0.87, 0.92)
## Positive likelihood ratio              3.75 (3.32, 4.24)
## Negative likelihood ratio              0.13 (0.11, 0.16)
## ---------------------------------------------------------

##                 est      lower      upper
## aprev     0.5498108  0.5249373  0.5744996
## tprev     0.4691047  0.4443055  0.4940184
## se        0.9005376  0.8767462  0.9210923
## sp        0.7600950  0.7297765  0.7885803
## diag.acc  0.8259773  0.8064049  0.8443346
## diag.or  28.6861119 21.5181917 38.2417364
## nnd       1.5137005  1.4091004  1.6487431
## youden    0.6606326  0.6065226  0.7096726
## ppv       0.7683486  0.7388926  0.7959784
## npv       0.8963585  0.8716393  0.9177402
## plr       3.7537262  3.3206884  4.2432346
## nlr       0.1308552  0.1050643  0.1629771

Epidemiology in R

Kushan De Silva

December 26, 2017