# Installing some useful packages
library(tidyverse)
library(kableExtra)
library(maptools)
library(rgdal)
library(plyr)
library(ggpubr)    # Combine ggplot

Problem 1

(HIV diagnosis) Suppose there are two HIV test: ELISA is less expensive but less accurate; Western blot(WB) is more accurate but more expensive. The prevalence of HIV is 0.00148. Given HIV, the sensitivity and specificity of both tests are
\[ \begin{align} Sensitivity:\ &P(ELISA\ test\ +|HIV) = 0.93,\quad P(WB\ test\ + |HIV) = 0.999 \\ Specificity:\ &P(ELISA\ test\ -|no-HIV) = 0.99,\quad P(WB\ test\ -|no-HIV) = 0.991. \end{align} \] The following procedures are adopted in the US military in early years:

  1. Write down all possible outcomes at each stage.
    \(Let\left\{\begin{aligned} &H\ be\ detect\ HIV\ + \\ &T\ be\ detect\ HIV\ - \end{aligned} \right.\quad and\ Outcomes\ Set=\{HIV, ELSA_{1}, ELSA_{2-1},ELSA_{2-2}, WB_{1-1}, WB_{1-2}\}\)
    則每一步stage 的outcome如下

    rbind(c("{H}, {T}"), 
          c("{H, H}, {H, T}, {T, H}, {T, T}"), 
          c("{H, H, H, H}, {H, H, H, T}, {H, H, T, H}, {H, H, T, T},        
            {T, H, H, H}, {T, H, H, T}, {T, H, T, H}, {T, H, T, T}"), 
          c("{H, H, H, H, H, H}, {H, H, H, H, T, H}, {H, H, H, H, H, T}, {H, H, H, H, T, T},        
            {H, H, H, T, H, H}, {H, H, H, T, T, H}, {H, H, H, T, H, T}, {H, H, H, T, T, T},        
            {H, H, T, H, H, H}, {H, H, T, H, T, H}, {H, H, T, H, H, T}, {H, H, T, H, T, T},        
            {T, H, H, H, H, H}, {T, H, H, H, T, H}, {T, H, H, H, H, T}, {T, H, H, H, T, T},        
            {T, H, H, T, H, H}, {T, H, H, T, T, H}, {T, H, H, T, H, T}, {T, H, H, T, T, T},        
            {T, H, T, H, H, H}, {T, H, T, H, T, H}, {T, H, T, H, H, T}, {T, H, T, H, T, T},        
            "))  %>% 
      `colnames<-`(. , "Outcomes") %>% 
      `rownames<-`(paste("Stage", seq(1,4))) %>% 
      kable(., "html") %>%
      kable_styling(bootstrap_options = "striped", full_width = F)
    Outcomes
    Stage 1 {H}, {T}
    Stage 2 {H, H}, {H, T}, {T, H}, {T, T}
    Stage 3 {H, H, H, H}, {H, H, H, T}, {H, H, T, H}, {H, H, T, T},
    {T, H, H, H}, {T, H, H, T}, {T, H, T, H}, {T, H, T, T}
    Stage 4 {H, H, H, H, H, H}, {H, H, H, H, T, H}, {H, H, H, H, H, T}, {H, H, H, H, T, T},
    {H, H, H, T, H, H}, {H, H, H, T, T, H}, {H, H, H, T, H, T}, {H, H, H, T, T, T},
    {H, H, T, H, H, H}, {H, H, T, H, T, H}, {H, H, T, H, H, T}, {H, H, T, H, T, T},
    {T, H, H, H, H, H}, {T, H, H, H, T, H}, {T, H, H, H, H, T}, {T, H, H, H, T, T},
    {T, H, H, T, H, H}, {T, H, H, T, T, H}, {T, H, H, T, H, T}, {T, H, H, T, T, T},
    {T, H, T, H, H, H}, {T, H, T, H, T, H}, {T, H, T, H, H, T}, {T, H, T, H, T, T},
  2. At j stage (j = 1, 2, 3), calculate the probability of making false suspicion: \[ \begin{align} P (no-HIV |need\ to\ continue\ for\ the\ j + 1\ stage) \end{align} \] According to the above the statement, we know \[ \begin{align} j=1,\ P(no-HIV |continue\ for\ the\ 2\ stage) &= \frac{0.99852*0.01}{0.00148*0.93+0.99852*0.01} \\ &\approx 0.8788551 \\ j=2,\ P(no-HIV |continue\ for\ the\ 3\ stage) &= \frac{0.8788551*[0.01^2+2*(0.01*0.99)]}{0.1211449*[0.93^2+2*(0.93*0.07)]+0.8788551*[0.01^2+2*(0.01*0.99)]} \\ &\approx 0.1266963 \\ j=3,\ P(no-HIV |continue\ for\ the\ 4\ stage) &= \frac{0.1266963*0.009^2}{0.8733037*0.999^2+0.1266963*0.009^2} \\ &\approx 0.0000118 \\ \end{align} \] according to the above information, we can summarize to below table

    False Discovery Rate
    Stage 1 0.8788551
    Stage 2 0.1266963
    Stage 3 0.0000118
  3. Calculate the false discovery rate at the final stage, i.e. P (no HIV | determine HIV infection).
    According to the above the statement, we need to find \(P(no-HIV| determine\ HIV\ infection)\), and we know the probability means that \(P(no-HIV |continue\ for\ the\ 4\ stage)\). so, the probability is \[ \begin{align} P(no-HIV |continue\ for\ the\ 4\ stage) &= \frac{0.1266963*0.009^2}{0.8733037*0.999^2+0.1266963*0.009^2} \\ &\approx 0.0000118 \\ \end{align} \]

Problem 2

Following the Bayesian analysis for the cancer rate example in BDA3 (section 2.7), develop a Poisson model for the cancer rate data in Taiwan (data set is provided in iLMS). You may select the data for a specific cancer in a year according to your preference.
Use the notations in your write-up:
\(y_{j}\) is the number of cancer counts in county j at a particular year
\(?c{j}\) is the underlying incidence rate for j-th county (in units: 1 per 100,000 persons per year)
\(n_{j}\) is the population of jth county

According to the above the statement, we can extract the informative column, and input the geographical information

Note: we need the informative column
Cancer type: Stomach
Gender: Regardless of men and women Diagnosis:2015

# 1. Loading geographical information
setwd("/Users/linweixiang/R/Bayesian Analysis/exercise/taiwan shape files")

TW_shape <- readShapePoly("county_WGS84.shp")

TW_shape@data$countyname <- TW_shape@data$countyname %>% 
  iconv(., "big5", "utf8") #  Convert BIG-5 to UTF-8

TW_Points <- fortify(TW_shape) # geographical information (data.frame)
# plot(TW_shape)   # plot geographical information
# 2. Loading dataset
setwd("/Users/linweixiang/R/Bayesian Analysis/Dataset/HW1")
data.hw1 <- read.table("台灣癌症數據.csv", 
                       header = T, sep = ",") %>% 
  filter(癌症別 == "胃" & 
              性別 == "不分男女" & 
              癌症診斷年 == "2015")
data.hw1 <- data.hw1[, c("縣市別", "癌症別", "癌症發生數", "粗率..每10萬人口.")]
data.hw1$n <- (data.hw1$癌症發生數/data.hw1$粗率..每10萬人口.) * 100000 
data.hw1 %>% head()  # dim() = 23 * 4

Above the table is the dataset

  1. Plot the data and make exploratory data analysis (EDA). Comment on it, by observation.

    # Preprocess
    # matching data by countyname using "substitution"
    # Cancer count
    count <- sub(data.hw1[1, "縣市別"],    
                 data.hw1[1, "癌症發生數"], 
                 TW_shape@data$countyname)
    
    for (i in 2:(dim(data.hw1)[1]) ){
      count <- sub(data.hw1[i, "縣市別"], 
                   data.hw1[i, "癌症發生數"], 
                   count)
    }
    # Rate
    rate <- sub(data.hw1[1, "縣市別"],    
                data.hw1[1, "粗率..每10萬人口."], 
                TW_shape@data$countyname)
    
    for (i in 2:(dim(data.hw1)[1]) ){
      rate <- sub(data.hw1[i, "縣市別"], 
                  data.hw1[i, "粗率..每10萬人口."], 
                  rate)
    }
    
    
    TW_shape@data$Count <- as.numeric(count)
    TW_shape@data$Rate <- as.numeric(rate)
    TW_shape@data$id <- row.names(TW_shape@data)
    
    TW_mapData <- join(TW_Points, TW_shape@data, by = "id") 
    # visualization
    g1 <- ggplot(data = TW_mapData, 
                 aes( x = long, y = lat, 
                      group = group, fill = Count)) + 
      geom_polygon() + 
      xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
      coord_equal() + 
      geom_path(color = "white") +
      scale_fill_gradient(low = "lawngreen", high = "red", 
                          name = "癌症發生數") + 
      labs(y = "latitude", x = "longitude") + 
      theme(text = element_text(family = 'STHeiti')) # chinese type
    
    g2 <- ggplot(data = TW_mapData, 
                 aes( x = long, y = lat, 
                      group = group, fill = Rate)) + 
      geom_polygon() + 
      xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
      coord_equal() + 
      geom_path(color = "white") +
      scale_fill_gradient(low = "lawngreen", high = "red", 
                          name = "粗率/每十萬人") + 
      labs(y = "latitude", x = "longitude") + 
      theme(plot.margin = unit(c(-0.1, -0.1, -0.1, -0.1), "cm"), 
            text = element_text(family = 'STHeiti')) # chinese
    
    ggarrange(g1, g2) %>% 
      annotate_figure(., top = text_grob("2015", size = 14)) 

    上述圖形為2015年台灣各縣市胃癌的分佈圖,可看出雙北的癌症發生數最高而花東的粗率最高,會有這種差異的原因是因為,宜蘭、花東那邊人口數較少,因此癌症發生數=人口數∗粗率100000,自然也較少,而雙北即使粗率較低,但因人口數較大,因此癌症發生數也較高。由上述討論可知,由粗率來看胃癌的分佈是較合理的。

  2. Specify your model for \(y_{j}\).
    因為癌症發生數為正整數,且癌症發生數跟人口數呈比例,故 \[ \begin{align} y_{j} \mathop{\sim}^{iid} Poisson(n_{j}*\theta_{j}),\quad j = 1,2...\ 23 \\ \end{align} \]

  3. Specify the prior for \(?c_{j}\)??s and justify your choice.
    \(f(y_{j})\)為exponential family,故考慮\(\pi(\theta_{j})\)取conjugate prior distribution,使posterior distribution可以有較明確的分配,且計算上較簡易。 故取 \[ \begin{align} ?c_{j} \mathop{\sim}^{iid} \Gamma(\alpha,\ \beta),\quad j=1,2...\ 23 \\ \end{align} \] 再來要考慮prior的參數\((\alpha, \beta)\)要如何選擇。 因為各縣市的胃癌發生數和各縣市的人口數有關,故hyper-parameter選擇整個台灣在2015年的平均胃癌發生率,即 \[ \begin{align} E(\theta) = \frac{\alpha}{\beta} = \frac{3849}{23498168} \approx 1.638*10^{-4} \end{align} \] So, \[ \begin{align} ?c_{j} \mathop{\sim}^{iid} \Gamma(\alpha = 3849,\ \beta = 23498168),\quad j=1,2...\ 23 \\ \end{align} \]

  4. Derive the posterior inference. In particular, provide the posterior mean and sd for the incidence rates for each counties. Summarize your results in tables or plots for a better visualization.
    Proof to the posterior distribution \[ \begin{align} \pi(\theta_{j}|y_{j}) &= \frac{f(\theta_{j}, y_{j})}{f(y_{j})} \\ &\propto f(y_{j}|\theta_{j})*\pi(\theta_{j}) \\ &\propto (\theta_{j})^{y_{j}}exp\{-n_{j}*\theta_{j}\} * (\theta_{j})^{\alpha+1} exp\{ -\beta \theta_{j}\} \\ &\propto (\theta_{j})^{y_{j} + \alpha + 1} exp\{-(n_{j} + \beta)\theta_{j} \} \\ \Rightarrow \theta_{j}|y_{j} &\mathop{\sim}^{iid} \Gamma(\alpha^{*} = y_{j} + \alpha,\ \beta^{*} = n_{j} + \beta),\quad j = 1,2...\ 23 \end{align} \] Get the posterior mean and sd by hyper-parameter of (c) \[ \left\{ \begin{aligned} &E(\theta_{j}|y_{j}) = \frac{\alpha^{*}}{\beta^{*}} = \frac{y_{j} + \alpha}{n_{j} + \beta} = \frac{y_{j}+3849}{n_{j}+23498168} \\ & SD(\theta_{j}|y_{j}) = \frac{\sqrt{\alpha^{*}}}{\beta^{*}} = \frac{\sqrt{y_{j} + \alpha}}{n_{j} + \beta} = \frac{\sqrt{y_{j}+3849}}{n_{j}+23498168} \end{aligned} \right. \] 可將資料帶入上述公式計算,其結果如下表所示

    # Posterior mean and sd
    a <- 3849
    b <- 23498168
    Posterior.mean <- ((data.hw1$癌症發生數 + a) / (data.hw1$n + b) * 100000) %>% round(., 3)
    Posterior.sd <- (sqrt(data.hw1$癌症發生數 + a)/(data.hw1$n + b) * 100000) %>% round(., 3)
    cbind(Posterior.mean, Posterior.sd) %>% 
      `rownames<-`(c(data.hw1$縣市別 %>% as.character())) %>% 
      kable(., "html") %>% 
      kable_styling(bootstrap_options = "striped", full_width = F)
    Posterior.mean Posterior.sd
    台閩地區 16.380 0.187
    台北市 16.624 0.252
    台中市 16.302 0.249
    台南市 16.219 0.253
    高雄市 16.372 0.250
    基隆市 16.493 0.263
    新竹市 16.354 0.261
    嘉義市 16.345 0.262
    新北市 16.451 0.245
    桃園市 16.189 0.251
    新竹縣 16.206 0.260
    宜蘭縣 16.488 0.262
    苗栗縣 16.320 0.260
    彰化縣 16.416 0.257
    南投縣 16.357 0.261
    雲林縣 16.402 0.260
    嘉義縣 16.338 0.261
    屏東縣 16.455 0.260
    澎湖縣 16.360 0.263
    花蓮縣 16.492 0.263
    台東縣 16.467 0.263
    金門縣 16.322 0.263
    連江縣 16.388 0.264

    上述表格,表後驗粗率的平均數和標準差。單位:每十萬 因為hyper-parameter選擇整個台灣在2015年的平均胃癌發生率,故可算出posterior rate(後驗粗率),並將其視覺化如下

    # Visulization for bayesian analysis
    data.hw1$post.rate <- (data.hw1$癌症發生數 + a) / (data.hw1$n + b) * 100000
    
    Post.Rate <- sub(data.hw1[1, "縣市別"],    
                     data.hw1[1, "post.rate"], 
                     TW_shape@data$countyname)
    
    for (i in 2:(dim(data.hw1)[1]) ){
      Post.Rate <- sub(data.hw1[i, "縣市別"], 
                       data.hw1[i, "post.rate"], 
                       Post.Rate)
    }
    
    TW_shape@data$Post.Rate <- as.numeric(Post.Rate)
    TW_shape@data$id <- row.names(TW_shape@data)
    
    TW_mapData <- join(TW_Points, TW_shape@data, by = "id") 
    
    g3 <- ggplot(data = TW_mapData, 
                 aes( x = long, y = lat, 
                      group = group, fill = Post.Rate)) + 
      geom_polygon() + 
      xlim(c(120, 122)) + ylim(c(21.9, 25.3)) +
      coord_equal() + 
      geom_path(color = "white") +
      scale_fill_gradient(low = "lawngreen", high = "red", 
                          name = "Post.Rate") + 
      labs(y = "latitude", x = "longitude") + 
      theme(plot.margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm"), 
            text = element_text(family = 'STHeiti')) # chinese type
    ggarrange(g3, g2) %>% 
      annotate_figure(., top = text_grob("2015", size = 14)) 

  5. Comments on your findings, could be on analysis, modeling ideas, or anything else.
    由(d)小題的視覺化圖形可看出,原始粗率最高的縣市分佈在宜蘭、花東。經過bayesian analysis討論後;而後驗粗率最高的縣市則在台北市。但兩者相比較可看出,基本上台灣胃癌粗率最高的縣市分佈並無太大變化。C