# Installing some useful packages
library(tidyverse)
library(kableExtra)
library(maptools)
library(rgdal)
library(plyr)
library(ggpubr)    # Combine ggplot

Problem 1

(HIV diagnosis) Suppose there are two HIV test: ELISA is less expensive but less accurate; Western blot(WB) is more accurate but more expensive. The prevalence of HIV is 0.00148. Given HIV, the sensitivity and specificity of both tests are
\[ \begin{align} Sensitivity:\ &P(ELISA\ test\ +|HIV) = 0.93,\quad P(WB\ test\ + |HIV) = 0.999 \\ Specificity:\ &P(ELISA\ test\ -|no-HIV) = 0.99,\quad P(WB\ test\ -|no-HIV) = 0.991. \end{align} \] The following procedures are adopted in the US military in early years:

first screening with ELISA
if positive, 2 more rounds of ELISA
if either positive, two Western blot assays
only if both positive, determine HIV infection

Write down all possible outcomes at each stage.
\(Let\left\{\begin{aligned} &H\ be\ detect\ HIV\ + \\ &T\ be\ detect\ HIV\ - \end{aligned} \right.\quad and\ Outcomes\ Set=\{HIV, ELSA_{1}, ELSA_{2-1},ELSA_{2-2}, WB_{1-1}, WB_{1-2}\}\)
則每一步stage 的outcome如下

rbind(c("{H}, {T}"), 
      c("{H, H}, {H, T}, {T, H}, {T, T}"), 
      c("{H, H, H, H}, {H, H, H, T}, {H, H, T, H}, {H, H, T, T},        
        {T, H, H, H}, {T, H, H, T}, {T, H, T, H}, {T, H, T, T}"), 
      c("{H, H, H, H, H, H}, {H, H, H, H, T, H}, {H, H, H, H, H, T}, {H, H, H, H, T, T},        
        {H, H, H, T, H, H}, {H, H, H, T, T, H}, {H, H, H, T, H, T}, {H, H, H, T, T, T},        
        {H, H, T, H, H, H}, {H, H, T, H, T, H}, {H, H, T, H, H, T}, {H, H, T, H, T, T},        
        {T, H, H, H, H, H}, {T, H, H, H, T, H}, {T, H, H, H, H, T}, {T, H, H, H, T, T},        
        {T, H, H, T, H, H}, {T, H, H, T, T, H}, {T, H, H, T, H, T}, {T, H, H, T, T, T},        
        {T, H, T, H, H, H}, {T, H, T, H, T, H}, {T, H, T, H, H, T}, {T, H, T, H, T, T},        
        "))  %>% 
  `colnames<-`(. , "Outcomes") %>% 
  `rownames<-`(paste("Stage", seq(1,4))) %>% 
  kable(., "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)

	Outcomes
Stage 1	{H}, {T}
Stage 2	{H, H}, {H, T}, {T, H}, {T, T}
Stage 3	{H, H, H, H}, {H, H, H, T}, {H, H, T, H}, {H, H, T, T}, {T, H, H, H}, {T, H, H, T}, {T, H, T, H}, {T, H, T, T}
Stage 4	{H, H, H, H, H, H}, {H, H, H, H, T, H}, {H, H, H, H, H, T}, {H, H, H, H, T, T}, {H, H, H, T, H, H}, {H, H, H, T, T, H}, {H, H, H, T, H, T}, {H, H, H, T, T, T}, {H, H, T, H, H, H}, {H, H, T, H, T, H}, {H, H, T, H, H, T}, {H, H, T, H, T, T}, {T, H, H, H, H, H}, {T, H, H, H, T, H}, {T, H, H, H, H, T}, {T, H, H, H, T, T}, {T, H, H, T, H, H}, {T, H, H, T, T, H}, {T, H, H, T, H, T}, {T, H, H, T, T, T}, {T, H, T, H, H, H}, {T, H, T, H, T, H}, {T, H, T, H, H, T}, {T, H, T, H, T, T},

At j stage (j = 1, 2, 3), calculate the probability of making false suspicion: \[ \begin{align} P (no-HIV |need\ to\ continue\ for\ the\ j + 1\ stage) \end{align} \] According to the above the statement, we know \[ \begin{align} j=1,\ P(no-HIV |continue\ for\ the\ 2\ stage) &= \frac{0.99852*0.01}{0.00148*0.93+0.99852*0.01} \\ &\approx 0.8788551 \\ j=2,\ P(no-HIV |continue\ for\ the\ 3\ stage) &= \frac{0.8788551*[0.01^2+2*(0.01*0.99)]}{0.1211449*[0.93^2+2*(0.93*0.07)]+0.8788551*[0.01^2+2*(0.01*0.99)]} \\ &\approx 0.1266963 \\ j=3,\ P(no-HIV |continue\ for\ the\ 4\ stage) &= \frac{0.1266963*0.009^2}{0.8733037*0.999^2+0.1266963*0.009^2} \\ &\approx 0.0000118 \\ \end{align} \] according to the above information, we can summarize to below table

False Discovery Rate

Stage 1 0.8788551

Stage 2 0.1266963

Stage 3 0.0000118
Calculate the false discovery rate at the final stage, i.e. P (no HIV | determine HIV infection).
According to the above the statement, we need to find \(P(no-HIV| determine\ HIV\ infection)\), and we know the probability means that \(P(no-HIV |continue\ for\ the\ 4\ stage)\). so, the probability is \[ \begin{align} P(no-HIV |continue\ for\ the\ 4\ stage) &= \frac{0.1266963*0.009^2}{0.8733037*0.999^2+0.1266963*0.009^2} \\ &\approx 0.0000118 \\ \end{align} \]

	False Discovery Rate
Stage 1	0.8788551
Stage 2	0.1266963
Stage 3	0.0000118

Problem 2

Following the Bayesian analysis for the cancer rate example in BDA3 (section 2.7), develop a Poisson model for the cancer rate data in Taiwan (data set is provided in iLMS). You may select the data for a specific cancer in a year according to your preference.
Use the notations in your write-up:
\(y_{j}\) is the number of cancer counts in county j at a particular year
\(?c{j}\) is the underlying incidence rate for j-th county (in units: 1 per 100,000 persons per year)
\(n_{j}\) is the population of jth county

According to the above the statement, we can extract the informative column, and input the geographical information

Note: we need the informative column
Cancer type: Stomach
Gender: Regardless of men and women Diagnosis:2015

# 1. Loading geographical information
setwd("/Users/linweixiang/R/Bayesian Analysis/exercise/taiwan shape files")

TW_shape <- readShapePoly("county_WGS84.shp")

TW_shape@data$countyname <- TW_shape@data$countyname %>% 
  iconv(., "big5", "utf8") #  Convert BIG-5 to UTF-8

TW_Points <- fortify(TW_shape) # geographical information (data.frame)
# plot(TW_shape)   # plot geographical information

# 2. Loading dataset
setwd("/Users/linweixiang/R/Bayesian Analysis/Dataset/HW1")
data.hw1 <- read.table("台灣癌症數據.csv", 
                       header = T, sep = ",") %>% 
  filter(癌症別 == "胃" & 
              性別 == "不分男女" & 
              癌症診斷年 == "2015")
data.hw1 <- data.hw1[, c("縣市別", "癌症別", "癌症發生數", "粗率..每10萬人口.")]
data.hw1$n <- (data.hw1$癌症發生數/data.hw1$粗率..每10萬人口.) * 100000 
data.hw1 %>% head()  # dim() = 23 * 4

Above the table is the dataset

Plot the data and make exploratory data analysis (EDA). Comment on it, by observation.

# Preprocess
# matching data by countyname using "substitution"
# Cancer count
count <- sub(data.hw1[1, "縣市別"],    
             data.hw1[1, "癌症發生數"], 
             TW_shape@data$countyname)

for (i in 2:(dim(data.hw1)[1]) ){
  count <- sub(data.hw1[i, "縣市別"], 
               data.hw1[i, "癌症發生數"], 
               count)
}
# Rate
rate <- sub(data.hw1[1, "縣市別"],    
            data.hw1[1, "粗率..每10萬人口."], 
            TW_shape@data$countyname)

for (i in 2:(dim(data.hw1)[1]) ){
  rate <- sub(data.hw1[i, "縣市別"], 
              data.hw1[i, "粗率..每10萬人口."], 
              rate)
}


TW_shape@data$Count <- as.numeric(count)
TW_shape@data$Rate <- as.numeric(rate)
TW_shape@data$id <- row.names(TW_shape@data)

TW_mapData <- join(TW_Points, TW_shape@data, by = "id")

# visualization
g1 <- ggplot(data = TW_mapData, 
             aes( x = long, y = lat, 
                  group = group, fill = Count)) + 
  geom_polygon() + 
  xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
  coord_equal() + 
  geom_path(color = "white") +
  scale_fill_gradient(low = "lawngreen", high = "red", 
                      name = "癌症發生數") + 
  labs(y = "latitude", x = "longitude") + 
  theme(text = element_text(family = 'STHeiti')) # chinese type

g2 <- ggplot(data = TW_mapData, 
             aes( x = long, y = lat, 
                  group = group, fill = Rate)) + 
  geom_polygon() + 
  xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
  coord_equal() + 
  geom_path(color = "white") +
  scale_fill_gradient(low = "lawngreen", high = "red", 
                      name = "粗率/每十萬人") + 
  labs(y = "latitude", x = "longitude") + 
  theme(plot.margin = unit(c(-0.1, -0.1, -0.1, -0.1), "cm"), 
        text = element_text(family = 'STHeiti')) # chinese

ggarrange(g1, g2) %>% 
  annotate_figure(., top = text_grob("2015", size = 14))

上述圖形為2015年台灣各縣市胃癌的分佈圖，可看出雙北的癌症發生數最高而花東的粗率最高，會有這種差異的原因是因為，宜蘭、花東那邊人口數較少，因此癌症發生數=人口數∗粗率100000，自然也較少，而雙北即使粗率較低，但因人口數較大，因此癌症發生數也較高。由上述討論可知，由粗率來看胃癌的分佈是較合理的。

Specify your model for \(y_{j}\).
因為癌症發生數為正整數，且癌症發生數跟人口數呈比例，故 \[ \begin{align} y_{j} \mathop{\sim}^{iid} Poisson(n_{j}*\theta_{j}),\quad j = 1,2...\ 23 \\ \end{align} \]
Specify the prior for \(?c_{j}\)??s and justify your choice.
因\(f(y_{j})\)為exponential family，故考慮\(\pi(\theta_{j})\)取conjugate prior distribution，使posterior distribution可以有較明確的分配，且計算上較簡易。故取 \[ \begin{align} ?c_{j} \mathop{\sim}^{iid} \Gamma(\alpha,\ \beta),\quad j=1,2...\ 23 \\ \end{align} \] 再來要考慮prior的參數\((\alpha, \beta)\)要如何選擇。因為各縣市的胃癌發生數和各縣市的人口數有關，故hyper-parameter選擇整個台灣在2015年的平均胃癌發生率，即 \[ \begin{align} E(\theta) = \frac{\alpha}{\beta} = \frac{3849}{23498168} \approx 1.638*10^{-4} \end{align} \] So, \[ \begin{align} ?c_{j} \mathop{\sim}^{iid} \Gamma(\alpha = 3849,\ \beta = 23498168),\quad j=1,2...\ 23 \\ \end{align} \]

Derive the posterior inference. In particular, provide the posterior mean and sd for the incidence rates for each counties. Summarize your results in tables or plots for a better visualization.
Proof to the posterior distribution \[ \begin{align} \pi(\theta_{j}|y_{j}) &= \frac{f(\theta_{j}, y_{j})}{f(y_{j})} \\ &\propto f(y_{j}|\theta_{j})*\pi(\theta_{j}) \\ &\propto (\theta_{j})^{y_{j}}exp\{-n_{j}*\theta_{j}\} * (\theta_{j})^{\alpha+1} exp\{ -\beta \theta_{j}\} \\ &\propto (\theta_{j})^{y_{j} + \alpha + 1} exp\{-(n_{j} + \beta)\theta_{j} \} \\ \Rightarrow \theta_{j}|y_{j} &\mathop{\sim}^{iid} \Gamma(\alpha^{*} = y_{j} + \alpha,\ \beta^{*} = n_{j} + \beta),\quad j = 1,2...\ 23 \end{align} \] Get the posterior mean and sd by hyper-parameter of (c) \[ \left\{ \begin{aligned} &E(\theta_{j}|y_{j}) = \frac{\alpha^{*}}{\beta^{*}} = \frac{y_{j} + \alpha}{n_{j} + \beta} = \frac{y_{j}+3849}{n_{j}+23498168} \\ & SD(\theta_{j}|y_{j}) = \frac{\sqrt{\alpha^{*}}}{\beta^{*}} = \frac{\sqrt{y_{j} + \alpha}}{n_{j} + \beta} = \frac{\sqrt{y_{j}+3849}}{n_{j}+23498168} \end{aligned} \right. \] 可將資料帶入上述公式計算，其結果如下表所示

# Posterior mean and sd
a <- 3849
b <- 23498168
Posterior.mean <- ((data.hw1$癌症發生數 + a) / (data.hw1$n + b) * 100000) %>% round(., 3)
Posterior.sd <- (sqrt(data.hw1$癌症發生數 + a)/(data.hw1$n + b) * 100000) %>% round(., 3)
cbind(Posterior.mean, Posterior.sd) %>% 
  `rownames<-`(c(data.hw1$縣市別 %>% as.character())) %>% 
  kable(., "html") %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)

	Posterior.mean	Posterior.sd
台閩地區	16.380	0.187
台北市	16.624	0.252
台中市	16.302	0.249
台南市	16.219	0.253
高雄市	16.372	0.250
基隆市	16.493	0.263
新竹市	16.354	0.261
嘉義市	16.345	0.262
新北市	16.451	0.245
桃園市	16.189	0.251
新竹縣	16.206	0.260
宜蘭縣	16.488	0.262
苗栗縣	16.320	0.260
彰化縣	16.416	0.257
南投縣	16.357	0.261
雲林縣	16.402	0.260
嘉義縣	16.338	0.261
屏東縣	16.455	0.260
澎湖縣	16.360	0.263
花蓮縣	16.492	0.263
台東縣	16.467	0.263
金門縣	16.322	0.263
連江縣	16.388	0.264

上述表格，表後驗粗率的平均數和標準差。單位:每十萬因為hyper-parameter選擇整個台灣在2015年的平均胃癌發生率，故可算出posterior rate(後驗粗率)，並將其視覺化如下

# Visulization for bayesian analysis
data.hw1$post.rate <- (data.hw1$癌症發生數 + a) / (data.hw1$n + b) * 100000

Post.Rate <- sub(data.hw1[1, "縣市別"],    
                 data.hw1[1, "post.rate"], 
                 TW_shape@data$countyname)

for (i in 2:(dim(data.hw1)[1]) ){
  Post.Rate <- sub(data.hw1[i, "縣市別"], 
                   data.hw1[i, "post.rate"], 
                   Post.Rate)
}

TW_shape@data$Post.Rate <- as.numeric(Post.Rate)
TW_shape@data$id <- row.names(TW_shape@data)

TW_mapData <- join(TW_Points, TW_shape@data, by = "id") 

g3 <- ggplot(data = TW_mapData, 
             aes( x = long, y = lat, 
                  group = group, fill = Post.Rate)) + 
  geom_polygon() + 
  xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
  coord_equal() + 
  geom_path(color = "white") +
  scale_fill_gradient(low = "lawngreen", high = "red", 
                      name = "Post.Rate") + 
  labs(y = "latitude", x = "longitude") + 
  theme(plot.margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm"), 
        text = element_text(family = 'STHeiti')) # chinese type
ggarrange(g3, g2) %>% 
  annotate_figure(., top = text_grob("2015", size = 14))

Comments on your findings, could be on analysis, modeling ideas, or anything else.
由(d)小題的視覺化圖形可看出，原始粗率最高的縣市分佈在宜蘭、花東。經過bayesian analysis討論後;而後驗粗率最高的縣市則在台北市。但兩者相比較可看出，基本上台灣胃癌粗率最高的縣市分佈並無太大變化。C

Bayesian Analysis_HW1

0652626_林暐翔

2019/3/9

Problem 1

Problem 2