# Installing some useful packages
library(tidyverse)
library(kableExtra)
library(maptools)
library(rgdal)
library(plyr)
library(ggpubr) # Combine ggplot
(HIV diagnosis) Suppose there are two HIV test: ELISA is less expensive but less accurate; Western blot(WB) is more accurate but more expensive. The prevalence of HIV is 0.00148. Given HIV, the sensitivity and specificity of both tests are
\[
\begin{align}
Sensitivity:\ &P(ELISA\ test\ +|HIV) = 0.93,\quad
P(WB\ test\ + |HIV) = 0.999 \\
Specificity:\ &P(ELISA\ test\ -|no-HIV) = 0.99,\quad P(WB\ test\ -|no-HIV) = 0.991.
\end{align}
\] The following procedures are adopted in the US military in early years:
Write down all possible outcomes at each stage.
\(Let\left\{\begin{aligned} &H\ be\ detect\ HIV\ + \\ &T\ be\ detect\ HIV\ - \end{aligned} \right.\quad and\ Outcomes\ Set=\{HIV, ELSA_{1}, ELSA_{2-1},ELSA_{2-2}, WB_{1-1}, WB_{1-2}\}\)
則每一步stage 的outcome如下
rbind(c("{H}, {T}"),
c("{H, H}, {H, T}, {T, H}, {T, T}"),
c("{H, H, H, H}, {H, H, H, T}, {H, H, T, H}, {H, H, T, T},
{T, H, H, H}, {T, H, H, T}, {T, H, T, H}, {T, H, T, T}"),
c("{H, H, H, H, H, H}, {H, H, H, H, T, H}, {H, H, H, H, H, T}, {H, H, H, H, T, T},
{H, H, H, T, H, H}, {H, H, H, T, T, H}, {H, H, H, T, H, T}, {H, H, H, T, T, T},
{H, H, T, H, H, H}, {H, H, T, H, T, H}, {H, H, T, H, H, T}, {H, H, T, H, T, T},
{T, H, H, H, H, H}, {T, H, H, H, T, H}, {T, H, H, H, H, T}, {T, H, H, H, T, T},
{T, H, H, T, H, H}, {T, H, H, T, T, H}, {T, H, H, T, H, T}, {T, H, H, T, T, T},
{T, H, T, H, H, H}, {T, H, T, H, T, H}, {T, H, T, H, H, T}, {T, H, T, H, T, T},
")) %>%
`colnames<-`(. , "Outcomes") %>%
`rownames<-`(paste("Stage", seq(1,4))) %>%
kable(., "html") %>%
kable_styling(bootstrap_options = "striped", full_width = F)
Outcomes | |
---|---|
Stage 1 | {H}, {T} |
Stage 2 | {H, H}, {H, T}, {T, H}, {T, T} |
Stage 3 |
{H, H, H, H}, {H, H, H, T}, {H, H, T, H}, {H, H, T, T}, {T, H, H, H}, {T, H, H, T}, {T, H, T, H}, {T, H, T, T} |
Stage 4 |
{H, H, H, H, H, H}, {H, H, H, H, T, H}, {H, H, H, H, H, T}, {H, H, H, H, T, T}, {H, H, H, T, H, H}, {H, H, H, T, T, H}, {H, H, H, T, H, T}, {H, H, H, T, T, T}, {H, H, T, H, H, H}, {H, H, T, H, T, H}, {H, H, T, H, H, T}, {H, H, T, H, T, T}, {T, H, H, H, H, H}, {T, H, H, H, T, H}, {T, H, H, H, H, T}, {T, H, H, H, T, T}, {T, H, H, T, H, H}, {T, H, H, T, T, H}, {T, H, H, T, H, T}, {T, H, H, T, T, T}, {T, H, T, H, H, H}, {T, H, T, H, T, H}, {T, H, T, H, H, T}, {T, H, T, H, T, T}, |
At j stage (j = 1, 2, 3), calculate the probability of making false suspicion: \[ \begin{align} P (no-HIV |need\ to\ continue\ for\ the\ j + 1\ stage) \end{align} \] According to the above the statement, we know \[ \begin{align} j=1,\ P(no-HIV |continue\ for\ the\ 2\ stage) &= \frac{0.99852*0.01}{0.00148*0.93+0.99852*0.01} \\ &\approx 0.8788551 \\ j=2,\ P(no-HIV |continue\ for\ the\ 3\ stage) &= \frac{0.8788551*[0.01^2+2*(0.01*0.99)]}{0.1211449*[0.93^2+2*(0.93*0.07)]+0.8788551*[0.01^2+2*(0.01*0.99)]} \\ &\approx 0.1266963 \\ j=3,\ P(no-HIV |continue\ for\ the\ 4\ stage) &= \frac{0.1266963*0.009^2}{0.8733037*0.999^2+0.1266963*0.009^2} \\ &\approx 0.0000118 \\ \end{align} \] according to the above information, we can summarize to below table
False Discovery Rate | |
---|---|
Stage 1 | 0.8788551 |
Stage 2 | 0.1266963 |
Stage 3 | 0.0000118 |
Calculate the false discovery rate at the final stage, i.e. P (no HIV | determine HIV infection).
According to the above the statement, we need to find \(P(no-HIV| determine\ HIV\ infection)\), and we know the probability means that \(P(no-HIV |continue\ for\ the\ 4\ stage)\). so, the probability is \[
\begin{align}
P(no-HIV |continue\ for\ the\ 4\ stage) &= \frac{0.1266963*0.009^2}{0.8733037*0.999^2+0.1266963*0.009^2} \\
&\approx 0.0000118 \\
\end{align}
\]
Following the Bayesian analysis for the cancer rate example in BDA3 (section 2.7), develop a Poisson model for the cancer rate data in Taiwan (data set is provided in iLMS). You may select the data for a specific cancer in a year according to your preference.
Use the notations in your write-up:
\(y_{j}\) is the number of cancer counts in county j at a particular year
\(?c{j}\) is the underlying incidence rate for j-th county (in units: 1 per 100,000 persons per year)
\(n_{j}\) is the population of jth county
According to the above the statement, we can extract the informative column, and input the geographical information
Note: we need the informative column
Cancer type
: Stomach
Gender
: Regardless of men and women Diagnosis
:2015
# 1. Loading geographical information
setwd("/Users/linweixiang/R/Bayesian Analysis/exercise/taiwan shape files")
TW_shape <- readShapePoly("county_WGS84.shp")
TW_shape@data$countyname <- TW_shape@data$countyname %>%
iconv(., "big5", "utf8") # Convert BIG-5 to UTF-8
TW_Points <- fortify(TW_shape) # geographical information (data.frame)
# plot(TW_shape) # plot geographical information
# 2. Loading dataset
setwd("/Users/linweixiang/R/Bayesian Analysis/Dataset/HW1")
data.hw1 <- read.table("台灣癌症數據.csv",
header = T, sep = ",") %>%
filter(癌症別 == "胃" &
性別 == "不分男女" &
癌症診斷年 == "2015")
data.hw1 <- data.hw1[, c("縣市別", "癌症別", "癌症發生數", "粗率..每10萬人口.")]
data.hw1$n <- (data.hw1$癌症發生數/data.hw1$粗率..每10萬人口.) * 100000
data.hw1 %>% head() # dim() = 23 * 4
Above the table is the dataset
Plot the data and make exploratory data analysis (EDA). Comment on it, by observation.
# Preprocess
# matching data by countyname using "substitution"
# Cancer count
count <- sub(data.hw1[1, "縣市別"],
data.hw1[1, "癌症發生數"],
TW_shape@data$countyname)
for (i in 2:(dim(data.hw1)[1]) ){
count <- sub(data.hw1[i, "縣市別"],
data.hw1[i, "癌症發生數"],
count)
}
# Rate
rate <- sub(data.hw1[1, "縣市別"],
data.hw1[1, "粗率..每10萬人口."],
TW_shape@data$countyname)
for (i in 2:(dim(data.hw1)[1]) ){
rate <- sub(data.hw1[i, "縣市別"],
data.hw1[i, "粗率..每10萬人口."],
rate)
}
TW_shape@data$Count <- as.numeric(count)
TW_shape@data$Rate <- as.numeric(rate)
TW_shape@data$id <- row.names(TW_shape@data)
TW_mapData <- join(TW_Points, TW_shape@data, by = "id")
# visualization
g1 <- ggplot(data = TW_mapData,
aes( x = long, y = lat,
group = group, fill = Count)) +
geom_polygon() +
xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
coord_equal() +
geom_path(color = "white") +
scale_fill_gradient(low = "lawngreen", high = "red",
name = "癌症發生數") +
labs(y = "latitude", x = "longitude") +
theme(text = element_text(family = 'STHeiti')) # chinese type
g2 <- ggplot(data = TW_mapData,
aes( x = long, y = lat,
group = group, fill = Rate)) +
geom_polygon() +
xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
coord_equal() +
geom_path(color = "white") +
scale_fill_gradient(low = "lawngreen", high = "red",
name = "粗率/每十萬人") +
labs(y = "latitude", x = "longitude") +
theme(plot.margin = unit(c(-0.1, -0.1, -0.1, -0.1), "cm"),
text = element_text(family = 'STHeiti')) # chinese
ggarrange(g1, g2) %>%
annotate_figure(., top = text_grob("2015", size = 14))
上述圖形為2015年台灣各縣市胃癌的分佈圖,可看出雙北的癌症發生數最高而花東的粗率最高,會有這種差異的原因是因為,宜蘭、花東那邊人口數較少,因此癌症發生數=人口數∗粗率100000,自然也較少,而雙北即使粗率較低,但因人口數較大,因此癌症發生數也較高。由上述討論可知,由粗率來看胃癌的分佈是較合理的。
Specify your model for \(y_{j}\).
因為癌症發生數為正整數,且癌症發生數跟人口數呈比例,故 \[
\begin{align}
y_{j} \mathop{\sim}^{iid} Poisson(n_{j}*\theta_{j}),\quad j = 1,2...\ 23 \\
\end{align}
\]
Specify the prior for \(?c_{j}\)??s and justify your choice.
因\(f(y_{j})\)為exponential family,故考慮\(\pi(\theta_{j})\)取conjugate prior distribution,使posterior distribution可以有較明確的分配,且計算上較簡易。 故取 \[
\begin{align}
?c_{j} \mathop{\sim}^{iid} \Gamma(\alpha,\ \beta),\quad j=1,2...\ 23 \\
\end{align}
\] 再來要考慮prior的參數\((\alpha, \beta)\)要如何選擇。 因為各縣市的胃癌發生數和各縣市的人口數有關,故hyper-parameter選擇整個台灣在2015年的平均胃癌發生率,即 \[
\begin{align}
E(\theta) = \frac{\alpha}{\beta} = \frac{3849}{23498168} \approx 1.638*10^{-4}
\end{align}
\] So, \[
\begin{align}
?c_{j} \mathop{\sim}^{iid} \Gamma(\alpha = 3849,\ \beta = 23498168),\quad j=1,2...\ 23 \\
\end{align}
\]
Derive the posterior inference. In particular, provide the posterior mean and sd for the incidence rates for each counties. Summarize your results in tables or plots for a better visualization.
Proof to the posterior distribution \[
\begin{align}
\pi(\theta_{j}|y_{j})
&= \frac{f(\theta_{j}, y_{j})}{f(y_{j})} \\
&\propto f(y_{j}|\theta_{j})*\pi(\theta_{j}) \\
&\propto (\theta_{j})^{y_{j}}exp\{-n_{j}*\theta_{j}\} * (\theta_{j})^{\alpha+1} exp\{ -\beta \theta_{j}\} \\
&\propto (\theta_{j})^{y_{j} + \alpha + 1} exp\{-(n_{j} + \beta)\theta_{j} \} \\
\Rightarrow \theta_{j}|y_{j} &\mathop{\sim}^{iid}
\Gamma(\alpha^{*} = y_{j} + \alpha,\ \beta^{*} = n_{j} + \beta),\quad j = 1,2...\ 23
\end{align}
\] Get the posterior mean and sd by hyper-parameter of (c) \[
\left\{
\begin{aligned}
&E(\theta_{j}|y_{j}) = \frac{\alpha^{*}}{\beta^{*}} = \frac{y_{j} + \alpha}{n_{j} + \beta} = \frac{y_{j}+3849}{n_{j}+23498168} \\
& SD(\theta_{j}|y_{j}) = \frac{\sqrt{\alpha^{*}}}{\beta^{*}} = \frac{\sqrt{y_{j} + \alpha}}{n_{j} + \beta} = \frac{\sqrt{y_{j}+3849}}{n_{j}+23498168}
\end{aligned}
\right.
\] 可將資料帶入上述公式計算,其結果如下表所示
# Posterior mean and sd
a <- 3849
b <- 23498168
Posterior.mean <- ((data.hw1$癌症發生數 + a) / (data.hw1$n + b) * 100000) %>% round(., 3)
Posterior.sd <- (sqrt(data.hw1$癌症發生數 + a)/(data.hw1$n + b) * 100000) %>% round(., 3)
cbind(Posterior.mean, Posterior.sd) %>%
`rownames<-`(c(data.hw1$縣市別 %>% as.character())) %>%
kable(., "html") %>%
kable_styling(bootstrap_options = "striped", full_width = F)
Posterior.mean | Posterior.sd | |
---|---|---|
台閩地區 | 16.380 | 0.187 |
台北市 | 16.624 | 0.252 |
台中市 | 16.302 | 0.249 |
台南市 | 16.219 | 0.253 |
高雄市 | 16.372 | 0.250 |
基隆市 | 16.493 | 0.263 |
新竹市 | 16.354 | 0.261 |
嘉義市 | 16.345 | 0.262 |
新北市 | 16.451 | 0.245 |
桃園市 | 16.189 | 0.251 |
新竹縣 | 16.206 | 0.260 |
宜蘭縣 | 16.488 | 0.262 |
苗栗縣 | 16.320 | 0.260 |
彰化縣 | 16.416 | 0.257 |
南投縣 | 16.357 | 0.261 |
雲林縣 | 16.402 | 0.260 |
嘉義縣 | 16.338 | 0.261 |
屏東縣 | 16.455 | 0.260 |
澎湖縣 | 16.360 | 0.263 |
花蓮縣 | 16.492 | 0.263 |
台東縣 | 16.467 | 0.263 |
金門縣 | 16.322 | 0.263 |
連江縣 | 16.388 | 0.264 |
上述表格,表後驗粗率的平均數和標準差。單位:每十萬 因為hyper-parameter選擇整個台灣在2015年的平均胃癌發生率,故可算出posterior rate(後驗粗率),並將其視覺化如下
# Visulization for bayesian analysis
data.hw1$post.rate <- (data.hw1$癌症發生數 + a) / (data.hw1$n + b) * 100000
Post.Rate <- sub(data.hw1[1, "縣市別"],
data.hw1[1, "post.rate"],
TW_shape@data$countyname)
for (i in 2:(dim(data.hw1)[1]) ){
Post.Rate <- sub(data.hw1[i, "縣市別"],
data.hw1[i, "post.rate"],
Post.Rate)
}
TW_shape@data$Post.Rate <- as.numeric(Post.Rate)
TW_shape@data$id <- row.names(TW_shape@data)
TW_mapData <- join(TW_Points, TW_shape@data, by = "id")
g3 <- ggplot(data = TW_mapData,
aes( x = long, y = lat,
group = group, fill = Post.Rate)) +
geom_polygon() +
xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
coord_equal() +
geom_path(color = "white") +
scale_fill_gradient(low = "lawngreen", high = "red",
name = "Post.Rate") +
labs(y = "latitude", x = "longitude") +
theme(plot.margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm"),
text = element_text(family = 'STHeiti')) # chinese type
ggarrange(g3, g2) %>%
annotate_figure(., top = text_grob("2015", size = 14))
Comments on your findings, could be on analysis, modeling ideas, or anything else.
由(d)小題的視覺化圖形可看出,原始粗率最高的縣市分佈在宜蘭、花東。經過bayesian analysis討論後;而後驗粗率最高的縣市則在台北市。但兩者相比較可看出,基本上台灣胃癌粗率最高的縣市分佈並無太大變化。C