COVID-19 存活情況預測

介紹

分析目的

分析的動機：其實是因為準備Data Mining的期中報告找到這個資料集，但正好可以利用這個資料集來統整和回顧一下之前學到的一些知識點。

資料分析目的：透過患者的一些基礎信息來預測他的COVID-19中的存活情況，有三種類別（deceased isolated released）,我們想對這三種類別進行預測分類，並可進行進一步的分析。

分析過程

Data Preprocessing
- Data cleaning: missing values, smooth noisy data, identify or remove outliers, and resolve inconsistencies
- Data integration: Integration of multiple databases, data cubes, or files
- Data transformation: Normalization and aggregation
- Data reduction: Obtains reduced representation in volume but produces the same or similar analytical results
- Data discretization: Part of data reduction but with particular importance, especially for numerical data
用 caret包建立三個模型，調整模型的空值變量（參數）
- KNN
- DT
- GBM

Data Preprocessing

載入資料以及需要的packages

setwd("/Users/a1234/Desktop/SCHOOL/碩班第二學期/Data mining/期中/archive")
caseData <- read.csv("Case.csv", header = T, stringsAsFactors = F)  
patientInfoData <- read.csv("PatientInfo.csv", header = T, stringsAsFactors = F)      # 個人資料 paper2 只用到了這一個資料集     
regionData <- read.csv("Region.csv", header = T, stringsAsFactors = F) 
str(patientInfoData)

## 'data.frame':    5165 obs. of  14 variables:
##  $ patient_id        : num  1e+09 1e+09 1e+09 1e+09 1e+09 ...
##  $ sex               : chr  "male" "male" "male" "male" ...
##  $ age               : chr  "50s" "30s" "50s" "20s" ...
##  $ country           : chr  "Korea" "Korea" "Korea" "Korea" ...
##  $ province          : chr  "Seoul" "Seoul" "Seoul" "Seoul" ...
##  $ city              : chr  "Gangseo-gu" "Jungnang-gu" "Jongno-gu" "Mapo-gu" ...
##  $ infection_case    : chr  "overseas inflow" "overseas inflow" "contact with patient" "overseas inflow" ...
##  $ infected_by       : chr  "" "" "2002000001" "" ...
##  $ contact_number    : chr  "75" "31" "17" "9" ...
##  $ symptom_onset_date: chr  "2020-01-22" "" "" "2020-01-26" ...
##  $ confirmed_date    : chr  "2020-01-23" "2020-01-30" "2020-01-30" "2020-01-30" ...
##  $ released_date     : chr  "2020-02-05" "2020-03-02" "2020-02-19" "2020-02-15" ...
##  $ deceased_date     : chr  "" "" "" "" ...
##  $ state             : chr  "released" "released" "released" "released" ...

table(patientInfoData$state)

## 
## deceased isolated released 
##       78     2158     2929

# 載入 packages
library('data.table') # input data
library('ggplot2') # visualization
library('ggthemes') # visualization
library('scales') # visualization
library("GGally") # plot data correlation
library('dplyr') # data manipulation
library('tidyr')

library('caret') # train model
library('randomForest') # classification algorithm
library('mice') # imputation
library('pROC') # ROC curves

# 找出韓國全部的案例，並把"patient_id","released_date","deceased_date"刪除
# 因為"released_date","deceased_date"直接就可以反應出state的狀況，所以肯定是要去掉的
patientInfoKorea <- patientInfoData %>%
  filter(country =="Korea")%>%
  select(-c("patient_id","country","infected_by","released_date","deceased_date"))
#  mutate(deceased_date = as.Date(deceased_date,format="%Y-%m-%d")) # 設定好date轉換的格式，不然後面會報錯

# sapply(patientInfoKorea,table)
str(patientInfoKorea)

## 'data.frame':    5123 obs. of  9 variables:
##  $ sex               : chr  "male" "male" "male" "male" ...
##  $ age               : chr  "50s" "30s" "50s" "20s" ...
##  $ province          : chr  "Seoul" "Seoul" "Seoul" "Seoul" ...
##  $ city              : chr  "Gangseo-gu" "Jungnang-gu" "Jongno-gu" "Mapo-gu" ...
##  $ infection_case    : chr  "overseas inflow" "overseas inflow" "contact with patient" "overseas inflow" ...
##  $ contact_number    : chr  "75" "31" "17" "9" ...
##  $ symptom_onset_date: chr  "2020-01-22" "" "" "2020-01-26" ...
##  $ confirmed_date    : chr  "2020-01-23" "2020-01-30" "2020-01-30" "2020-01-30" ...
##  $ state             : chr  "released" "released" "released" "released" ...

apply(patientInfoKorea,2,function(x){sum(is.na(x))})

##                sex                age           province               city 
##                  0                  0                  0                  0 
##     infection_case     contact_number symptom_onset_date     confirmed_date 
##                  0                  0                  0                  0 
##              state 
##                  0

# 簡單的數據類型的轉換
factorData <- patientInfoKorea %>%
  select(sex:infection_case,state)%>%
  mutate_all(as.factor)

factorData[factorData==""] <- NA # 把factorData所有的空值都用 NA 代替（終於找到了非常簡潔的寫法）

dateData <- patientInfoKorea %>%
  select(contact_number:confirmed_date)%>%
  mutate(contact_number = as.numeric(contact_number))%>%
  mutate_if(is.character,as.Date)

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

patInfoKorea <- cbind(factorData,dateData)

str(patInfoKorea)

## 'data.frame':    5123 obs. of  9 variables:
##  $ sex               : Factor w/ 3 levels "","female","male": 3 3 3 3 2 2 3 3 3 2 ...
##  $ age               : Factor w/ 12 levels "","0s","100s",..: 8 6 8 5 5 8 5 5 6 9 ...
##  $ province          : Factor w/ 17 levels "Busan","Chungcheongbuk-do",..: 16 16 16 16 16 16 16 16 16 16 ...
##  $ city              : Factor w/ 163 levels "","Andong-si",..: 42 96 94 98 127 94 94 34 134 127 ...
##  $ infection_case    : Factor w/ 52 levels "","Anyang Gunpo Pastors Group",..: 35 35 7 35 7 7 7 35 35 7 ...
##  $ state             : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ contact_number    : num  75 31 17 9 2 43 0 0 68 6 ...
##  $ symptom_onset_date: Date, format: "2020-01-22" NA ...
##  $ confirmed_date    : Date, format: "2020-01-23" "2020-01-30" ...

sum(is.na(patInfoKorea))

## [1] 12294

Data cleaning

Missing values
Smooth noisy data, identify or remove outliers
Resolve inconsistencies

Missing values

原則：盡可能多的保留feature，因為本身就只用了一個資料集

# 查看看丟失數據，各個欄位NA的比例
apply(patInfoKorea,2,function(x){sum(is.na(x))})

##                sex                age           province               city 
##               1118               1376                  0                 93 
##     infection_case              state     contact_number symptom_onset_date 
##                913                  0               4354               4437 
##     confirmed_date 
##                  3

ratio <- apply(patInfoKorea,2,function(x){sum(is.na(x))/nrow(patientInfoData)}) 

Table <-as.data.frame(ratio)

ggplot(Table, aes(reorder(rownames(Table), +ratio),ratio))+ 
  geom_col()+ 
  theme(text = element_text(size=10), 
     axis.text.x = element_text(angle=30, hjust=1))

ggplot(Table,aes(x = rownames(Table), y = ratio))+
  geom_col()+ 
  theme(text = element_text(size=10), 
     axis.text.x = element_text(angle=30, hjust=1))

各個欄位NA值的處理方式：

sex：mice
age：mice
city：mice
infection_case：etc
contact_number：缺失值太多（>50%）刪除，因為可能與本身的存活率關係不大
symptom_onset_date（有症狀的日期）：缺失值太多（>50%），有日期的給1，NA值給0
confirmed_date：3 直接刪掉空缺的rows

NA處理方式的補充資料：

非常適合用來直觀表示切分欄位的機率密度圖👇： https://blog.csdn.net/weixin_45387324/article/details/109408376

用mice填充 missing data 的知識點 http://blog.fens.me/r-na-mice/

多圖ggdraw() ：https://www.jianshu.com/p/c154ca35530b

時間序列的相關性：https://rpubs.com/ivan0628/TimeSeries01

而R在處理遺漏值有五大厲害套件：

MICE
Amelia
missForest
Hmisc
mi

# 時間序列的分析
library("tidyverse")

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ tibble  3.1.0     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ✓ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()        masks data.table::between()
## x readr::col_factor()     masks scales::col_factor()
## x randomForest::combine() masks dplyr::combine()
## x purrr::discard()        masks scales::discard()
## x mice::filter()          masks dplyr::filter(), stats::filter()
## x dplyr::first()          masks data.table::first()
## x dplyr::lag()            masks stats::lag()
## x dplyr::last()           masks data.table::last()
## x purrr::lift()           masks caret::lift()
## x randomForest::margin()  masks ggplot2::margin()
## x purrr::transpose()      masks data.table::transpose()

library("forecast") # ggAcf()

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library("cowplot") # ggdraw()  多圖

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggthemes':
## 
##     theme_map

ggAcf(patInfoKorea$confirmed_date) +
  ggtitle('confirmed_date')

ggAcf(patInfoKorea$symptom_onset_date) +
  ggtitle('symptom_onset_date')

# mice 分析
md.pattern(patInfoKorea, rotate.names = T)

##      province state confirmed_date city infection_case  sex  age contact_number
## 248         1     1              1    1              1    1    1              1
## 424         1     1              1    1              1    1    1              1
## 282         1     1              1    1              1    1    1              0
## 1913        1     1              1    1              1    1    1              0
## 4           1     1              1    1              1    1    0              1
## 250         1     1              1    1              1    1    0              0
## 1           1     1              1    1              1    0    1              1
## 1           1     1              1    1              1    0    1              0
## 1           1     1              1    1              1    0    1              0
## 8           1     1              1    1              1    0    0              1
## 130         1     1              1    1              1    0    0              0
## 890         1     1              1    1              1    0    0              0
## 5           1     1              1    1              0    1    1              1
## 51          1     1              1    1              0    1    1              1
## 8           1     1              1    1              0    1    1              0
## 722         1     1              1    1              0    1    1              0
## 5           1     1              1    1              0    1    0              0
## 1           1     1              1    1              0    0    0              1
## 6           1     1              1    1              0    0    0              0
## 77          1     1              1    1              0    0    0              0
## 5           1     1              1    0              1    1    1              1
## 22          1     1              1    0              1    1    1              1
## 26          1     1              1    0              1    1    1              0
## 2           1     1              1    0              1    1    0              0
## 35          1     1              1    0              0    1    1              0
## 3           1     1              1    0              0    0    0              0
## 1           1     1              0    1              1    1    1              0
## 2           1     1              0    1              1    1    1              0
##             0     0              3   93            913 1118 1376           4354
##      symptom_onset_date      
## 248                   1     0
## 424                   0     1
## 282                   1     1
## 1913                  0     2
## 4                     0     2
## 250                   0     3
## 1                     0     2
## 1                     1     2
## 1                     0     3
## 8                     0     3
## 130                   1     3
## 890                   0     4
## 5                     1     1
## 51                    0     2
## 8                     1     2
## 722                   0     3
## 5                     0     4
## 1                     0     4
## 6                     1     4
## 77                    0     5
## 5                     1     1
## 22                    0     2
## 26                    0     3
## 2                     0     4
## 35                    0     4
## 3                     0     6
## 1                     1     2
## 2                     0     3
##                    4437 12294

md.pairs(patInfoKorea)

## $rr
##                     sex  age province city infection_case state contact_number
## sex                4005 3744     4005 3915           3179  4005            759
## age                3744 3747     3747 3659           2926  3747            756
## province           4005 3747     5123 5030           4210  5123            769
## city               3915 3659     5030 5030           4155  5030            742
## infection_case     3179 2926     4210 4155           4210  4210            712
## state              4005 3747     5123 5030           4210  5123            769
## contact_number      759  756      769  742            712   769            769
## symptom_onset_date  549  550      686  681            667   686            258
## confirmed_date     4002 3744     5120 5027           4207  5120            769
##                    symptom_onset_date confirmed_date
## sex                               549           4002
## age                               550           3744
## province                          686           5120
## city                              681           5027
## infection_case                    667           4207
## state                             686           5120
## contact_number                    258            769
## symptom_onset_date                686            685
## confirmed_date                    685           5120
## 
## $rm
##                     sex  age province city infection_case state contact_number
## sex                   0  261        0   90            826     0           3246
## age                   3    0        0   88            821     0           2991
## province           1118 1376        0   93            913     0           4354
## city               1115 1371        0    0            875     0           4288
## infection_case     1031 1284        0   55              0     0           3498
## state              1118 1376        0   93            913     0           4354
## contact_number       10   13        0   27             57     0              0
## symptom_onset_date  137  136        0    5             19     0            428
## confirmed_date     1118 1376        0   93            913     0           4351
##                    symptom_onset_date confirmed_date
## sex                              3456              3
## age                              3197              3
## province                         4437              3
## city                             4349              3
## infection_case                   3543              3
## state                            4437              3
## contact_number                    511              0
## symptom_onset_date                  0              1
## confirmed_date                   4435              0
## 
## $mr
##                     sex  age province city infection_case state contact_number
## sex                   0    3     1118 1115           1031  1118             10
## age                 261    0     1376 1371           1284  1376             13
## province              0    0        0    0              0     0              0
## city                 90   88       93    0             55    93             27
## infection_case      826  821      913  875              0   913             57
## state                 0    0        0    0              0     0              0
## contact_number     3246 2991     4354 4288           3498  4354              0
## symptom_onset_date 3456 3197     4437 4349           3543  4437            511
## confirmed_date        3    3        3    3              3     3              0
##                    symptom_onset_date confirmed_date
## sex                               137           1118
## age                               136           1376
## province                            0              0
## city                                5             93
## infection_case                     19            913
## state                               0              0
## contact_number                    428           4351
## symptom_onset_date                  0           4435
## confirmed_date                      1              0
## 
## $mm
##                     sex  age province city infection_case state contact_number
## sex                1118 1115        0    3             87     0           1108
## age                1115 1376        0    5             92     0           1363
## province              0    0        0    0              0     0              0
## city                  3    5        0   93             38     0             66
## infection_case       87   92        0   38            913     0            856
## state                 0    0        0    0              0     0              0
## contact_number     1108 1363        0   66            856     0           4354
## symptom_onset_date  981 1240        0   88            894     0           3926
## confirmed_date        0    0        0    0              0     0              3
##                    symptom_onset_date confirmed_date
## sex                               981              0
## age                              1240              0
## province                            0              0
## city                               88              0
## infection_case                    894              0
## state                               0              0
## contact_number                   3926              3
## symptom_onset_date               4437              2
## confirmed_date                      2              3

# 選取 "sex","age","city"
patInfoKoreaMICE <- patInfoKorea %>%
  select(c("sex","age","city"))
patInfoKoreaOther <- patInfoKorea %>%
  select(-c("sex","age","city"))

# mice 填補 "sex","age","city"
mice_patInfoKorea <- mice(patInfoKoreaMICE,
                  m = 3,           # 產生三個被填補好的資料表
                  maxit = 10,      # max iteration
                  method = "cart", # 使用CART決策樹，進行遺漏值預測
                  seed = 100)      # set.seed()，令抽樣每次都一樣

## 
##  iter imp variable
##   1   1  sex  age  city
##   1   2  sex  age  city
##   1   3  sex  age  city
##   2   1  sex  age  city
##   2   2  sex  age  city
##   2   3  sex  age  city
##   3   1  sex  age  city
##   3   2  sex  age  city
##   3   3  sex  age  city
##   4   1  sex  age  city
##   4   2  sex  age  city
##   4   3  sex  age  city
##   5   1  sex  age  city
##   5   2  sex  age  city
##   5   3  sex  age  city
##   6   1  sex  age  city
##   6   2  sex  age  city
##   6   3  sex  age  city
##   7   1  sex  age  city
##   7   2  sex  age  city
##   7   3  sex  age  city
##   8   1  sex  age  city
##   8   2  sex  age  city
##   8   3  sex  age  city
##   9   1  sex  age  city
##   9   2  sex  age  city
##   9   3  sex  age  city
##   10   1  sex  age  city
##   10   2  sex  age  city
##   10   3  sex  age  city

## Warning: Number of logged events: 90

# 填補好的資料：因為m=3，所以會有三個填補好的資料集，可以用以下方式取出
head(complete(mice_patInfoKorea, 1)) # 1st data

##      sex age        city
## 1   male 50s  Gangseo-gu
## 2   male 30s Jungnang-gu
## 3   male 50s   Jongno-gu
## 4   male 20s     Mapo-gu
## 5 female 20s Seongbuk-gu
## 6 female 50s   Jongno-gu

head(complete(mice_patInfoKorea, 2))# 2nd data

##      sex age        city
## 1   male 50s  Gangseo-gu
## 2   male 30s Jungnang-gu
## 3   male 50s   Jongno-gu
## 4   male 20s     Mapo-gu
## 5 female 20s Seongbuk-gu
## 6 female 50s   Jongno-gu

head(complete(mice_patInfoKorea, 3))# 3rd data

##      sex age        city
## 1   male 50s  Gangseo-gu
## 2   male 30s Jungnang-gu
## 3   male 50s   Jongno-gu
## 4   male 20s     Mapo-gu
## 5 female 20s Seongbuk-gu
## 6 female 50s   Jongno-gu

library("lattice")

# 可以看到這裡的填充算法用到的都是"cart" "cart" "cart" 
summary(mice_patInfoKorea)

## Class: mids
## Number of multiple imputations:  3 
## Imputation methods:
##    sex    age   city 
## "cart" "cart" "cart" 
## PredictorMatrix:
##      sex age city
## sex    0   1    1
## age    1   0    1
## city   1   1    0
## Number of logged events:  90 
##   it im  dep meth
## 1  1  1  sex cart
## 2  1  1  age cart
## 3  1  1 city cart
## 4  1  2  sex cart
## 5  1  2  age cart
## 6  1  2 city cart
##                                                                                  out
## 1                                      age20s, cityGuri, cityGyeongsan-si, citySuwon
## 2                       sexmale, cityGunwi-gun, cityGuri, cityGyeryong-si, citySuwon
## 3                                                                    sexmale, age30s
## 4                                       age20s, cityGuri, cityGyeryong-si, citySuwon
## 5 sexfemale, cityGunwi-gun, cityGuri, cityGyeongsan-si, citySuwon, cityYeongdeok-gun
## 6                                                                    sexmale, age20s

# head(mice_patInfoKorea$imp)
stripplot(mice_patInfoKorea, col=c("grey",mdc(2)),pch=c(1,20))

# 分面板觀察，以sex和age做進行分組組合，觀察3組數據的填充情況。
xyplot(mice_patInfoKorea , sex ~  age | .imp, pch=20,cex=1.2)

# 每一個面板為一列的原始數據和3次填充的數據，藍色散點為原始數據取值的分佈，紅色的小點為填充的數據取值的分佈。

# 可以用第一個填充的表為接下來處理的原始表
patInfoKoreaSAC <- complete(mice_patInfoKorea, 1)

# 查看看原始類別的圖，和填完NA值之後的類別分佈
library("ggplot2")

od <- as.data.frame(table(patInfoKorea$age))
colnames(od)<-c("odvar","odfreq")
ne <- as.data.frame(table(patInfoKoreaSAC$age))
colnames(ne)<-c("nevar","nefreq")

ageComp <- cbind(od,ne)
ageComp

##    odvar odfreq nevar nefreq
## 1             0            0
## 2     0s     65    0s     98
## 3   100s      1  100s      1
## 4    10s    177   10s    224
## 5    20s    887   20s   1227
## 6    30s    510   30s    714
## 7    40s    513   40s    746
## 8    50s    665   50s    937
## 9    60s    479   60s    634
## 10   70s    231   70s    286
## 11   80s    170   80s    200
## 12   90s     49   90s     56

ggplot(ageComp, aes(x = odvar, group = 1)) + # You only have to add group = 1 into the ggplot or geom_line aes().
  geom_line(aes(y = odfreq)) +
  geom_line(aes(y = nefreq),color="deepskyblue")

# 其他缺失值的轉換
# + infection_case：etc
# + contact_number：缺失值太多（>50%）刪除，因為可能與本身的存活率關係不大
# + symptom_onset_date（有症狀的日期）：缺失值太多（>50%），有日期的給1，NA值給0
#   symptom_onset_date有日期可能是有比較嚴重的症狀所以可能會影響y的預測，所以給一個label
# + confirmed_date：3 直接刪掉空缺的rows    

patInfo <- cbind(patInfoKoreaSAC,patInfoKoreaOther)
str(patInfo)

## 'data.frame':    5123 obs. of  9 variables:
##  $ sex               : Factor w/ 3 levels "","female","male": 3 3 3 3 2 2 3 3 3 2 ...
##  $ age               : Factor w/ 12 levels "","0s","100s",..: 8 6 8 5 5 8 5 5 6 9 ...
##  $ city              : Factor w/ 163 levels "","Andong-si",..: 42 96 94 98 127 94 94 34 134 127 ...
##  $ province          : Factor w/ 17 levels "Busan","Chungcheongbuk-do",..: 16 16 16 16 16 16 16 16 16 16 ...
##  $ infection_case    : Factor w/ 52 levels "","Anyang Gunpo Pastors Group",..: 35 35 7 35 7 7 7 35 35 7 ...
##  $ state             : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ contact_number    : num  75 31 17 9 2 43 0 0 68 6 ...
##  $ symptom_onset_date: Date, format: "2020-01-22" NA ...
##  $ confirmed_date    : Date, format: "2020-01-23" "2020-01-30" ...

# 刪掉confirmed_date是NA的rows,去掉contact_number欄位，轉換symptom_onset_date添加label，賦值infection_case
patInfo <- patInfo%>%
  filter(is.na(patInfo$confirmed_date)==F)%>%
# patInfo <- patInfo[!(is.na(patInfo$confirmed_date)==T), ] # 另一種寫法
  select(-c("contact_number"))%>%
  mutate(symptom_onset_date=ifelse(is.na(symptom_onset_date)==T,0,1))%>%
  mutate_if(is.numeric,as.factor)

patInfo$infection_case[is.na(patInfo$infection_case) == T ] <- 'etc'

# 整理完NA值，再次查看NA值
(as.data.frame(table(patInfo$age)))

##    Var1 Freq
## 1          0
## 2    0s   97
## 3  100s    1
## 4   10s  223
## 5   20s 1227
## 6   30s  714
## 7   40s  745
## 8   50s  937
## 9   60s  634
## 10  70s  286
## 11  80s  200
## 12  90s   56

str(patInfo)

## 'data.frame':    5120 obs. of  8 variables:
##  $ sex               : Factor w/ 3 levels "","female","male": 3 3 3 3 2 2 3 3 3 2 ...
##  $ age               : Factor w/ 12 levels "","0s","100s",..: 8 6 8 5 5 8 5 5 6 9 ...
##  $ city              : Factor w/ 163 levels "","Andong-si",..: 42 96 94 98 127 94 94 34 134 127 ...
##  $ province          : Factor w/ 17 levels "Busan","Chungcheongbuk-do",..: 16 16 16 16 16 16 16 16 16 16 ...
##  $ infection_case    : Factor w/ 52 levels "","Anyang Gunpo Pastors Group",..: 35 35 7 35 7 7 7 35 35 7 ...
##  $ state             : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ symptom_onset_date: Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 1 1 ...
##  $ confirmed_date    : Date, format: "2020-01-23" "2020-01-30" ...

head(patInfo)

##      sex age        city province       infection_case    state
## 1   male 50s  Gangseo-gu    Seoul      overseas inflow released
## 2   male 30s Jungnang-gu    Seoul      overseas inflow released
## 3   male 50s   Jongno-gu    Seoul contact with patient released
## 4   male 20s     Mapo-gu    Seoul      overseas inflow released
## 5 female 20s Seongbuk-gu    Seoul contact with patient released
## 6 female 50s   Jongno-gu    Seoul contact with patient released
##   symptom_onset_date confirmed_date
## 1                  1     2020-01-23
## 2                  0     2020-01-30
## 3                  0     2020-01-30
## 4                  1     2020-01-30
## 5                  0     2020-01-31
## 6                  0     2020-01-31

特徵轉換篩選

# 不要city，city太離散了，保留province；也先去掉confirmed_date（目前還沒想到更好的方式轉換它）
patInfoTest <- patInfo %>%
  select(-c("city","confirmed_date"))%>%
  mutate(province = as.character(province))

apply(patInfoTest,2,function(x){sum(is.na(x))})

##                sex                age           province     infection_case 
##                  0                  0                  0                  0 
##              state symptom_onset_date 
##                  0                  0

patInfoTest%>%
  group_by(province,state)%>%
  mutate(counts = n())%>%
  ggplot(aes(reorder(province, +counts),counts,color=state))+
  geom_col()+
  theme(text = element_text(size=10), 
     axis.text.x = element_text(angle=30, hjust=1))

patInfoTest%>%
  group_by(infection_case,state)%>%
  mutate(counts = n())%>%
  ggplot(aes(reorder(infection_case, +counts),counts,color=state))+
  geom_col()+
  theme(text = element_text(size=10), 
     axis.text.x = element_text(angle=45, hjust=1))

# 查看province各個類別的數量
prTable <-as.data.frame(table(patInfoTest$province))
prTable

##                 Var1 Freq
## 1              Busan  145
## 2  Chungcheongbuk-do   56
## 3  Chungcheongnam-do  166
## 4              Daegu  131
## 5            Daejeon  119
## 6         Gangwon-do   61
## 7            Gwangju   44
## 8        Gyeonggi-do 1203
## 9   Gyeongsangbuk-do 1249
## 10  Gyeongsangnam-do  133
## 11           Incheon  342
## 12           Jeju-do   14
## 13      Jeollabuk-do   27
## 14      Jeollanam-do   23
## 15            Sejong   51
## 16             Seoul 1302
## 17             Ulsan   54

ggplot(prTable, aes(reorder(Var1, +Freq),Freq))+ 
  geom_col()+ 
  theme(text = element_text(size=10), 
     axis.text.x = element_text(angle=30, hjust=1))

# 感覺可以保留筆數最多的三個類別，剩下的再給一個類別，像是“etc”
rare_pro <- c("Busan","Chungcheongbuk-do","Chungcheongnam-do", "Daegu", "Daejeon", "Gangwon-do", 
              "Gwangju", "Gyeongsangnam-do", "Incheon", "Jeju-do", "Jeollabuk-do", "Jeollanam-do", "Sejong", "Ulsan")

patInfoTest$province[(patInfoTest$province) %in% rare_pro] <- 'etc'

# 查看infection_case各個類別的數量
infTable <-as.data.frame(table(patInfoTest$infection_case))
infTable

##                                             Var1 Freq
## 1                                                   0
## 2                     Anyang Gunpo Pastors Group    1
## 3                Biblical Language study meeting    3
## 4                    Bonghwa Pureun Nursing Home   31
## 5                       Changnyeong Coin Karaoke    4
## 6                       Cheongdo Daenam Hospital   21
## 7                           contact with patient 1606
## 8                       Coupang Logistics Center   80
## 9                     Daejeon door-to-door sales    1
## 10                               Daezayeon Korea    3
## 11                               Day Care Center   43
## 12                                 Dongan Church   17
## 13                       Dunsan Electronics Town   13
## 14                                           etc 1612
## 15                 Eunpyeong St. Mary's Hospital   15
## 16                         Gangnam Dongin Church    1
## 17                Gangnam Yeoksam-dong gathering    6
## 18                               Geochang Church    6
## 19 Geumcheon-gu rice milling machine manufacture    6
## 20                     Guri Collective Infection    5
## 21                           Guro-gu Call Center  112
## 22         Gyeongsan Cham Joeun Community Center   10
## 23                    Gyeongsan Jeil Silver Town   12
## 24                 Gyeongsan Seorin Nursing Home   15
## 25                       gym facility in Cheonan   30
## 26                        gym facility in Sejong    4
## 27                                 Itaewon Clubs  162
## 28                             KB Life Insurance   13
## 29                Korea Campus Crusade of Christ    7
## 30                                 Milal Shelter   11
## 31              Ministry of Oceans and Fisheries   28
## 32                                 Onchun Church   33
## 33                                   Orange Life    1
## 34                                   Orange Town    7
## 35                               overseas inflow  811
## 36                          Pilgrimage to Israel    2
## 37                                       Richway  128
## 38               River of Grace Community Church    1
## 39               Samsung Fire & Marine Insurance    4
## 40                        Samsung Medical Center    7
## 41                                 Seocho Family    5
## 42                              Seongdong-gu APT   13
## 43         Seoul City Hall Station safety worker    3
## 44                            Shincheonji Church  106
## 45              SMR Newly Planted Churches Group   36
## 46                       Suyeong-gu Kindergarten    3
## 47                       Uiwang Logistics Center    2
## 48                               Wangsung Church   24
## 49                   Yangcheon Table Tennis Club   44
## 50                            Yeonana News Class    5
## 51               Yeongdeungpo Learning Institute    3
## 52                               Yongin Brothers    4

ggplot(infTable, aes(reorder(Var1, +Freq),Freq))+ 
  geom_col()+ 
  theme(text = element_text(size=10), 
     axis.text.x = element_text(angle=30, hjust=1))

str(patInfoTest)

## 'data.frame':    5120 obs. of  6 variables:
##  $ sex               : Factor w/ 3 levels "","female","male": 3 3 3 3 2 2 3 3 3 2 ...
##  $ age               : Factor w/ 12 levels "","0s","100s",..: 8 6 8 5 5 8 5 5 6 9 ...
##  $ province          : chr  "Seoul" "Seoul" "Seoul" "Seoul" ...
##  $ infection_case    : Factor w/ 52 levels "","Anyang Gunpo Pastors Group",..: 35 35 7 35 7 7 7 35 35 7 ...
##  $ state             : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ symptom_onset_date: Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 1 1 ...

找出death最多的infection_case類別是什麼

# province 也轉成 factor
patInfoTest <- patInfoTest %>%
  mutate_if(is.character, as.factor)


# 切資料，注意y可能的資料分佈不平衡問題
library(rsample)  # data splitting 

# 分出訓練集和train 70% 30%
patInfoTestsplit <- initial_split(patInfoTest, prop = .7)
patInfoTest_train <- training(patInfoTestsplit)
patInfoTest_test <- testing(patInfoTestsplit)

patInfoTest_testX <- patInfoTest_test%>%
  select(-c("state"))
patInfoTest_testY <- patInfoTest_test%>%
  select(c("state"))

str(patInfoTest_train)

## 'data.frame':    3584 obs. of  6 variables:
##  $ sex               : Factor w/ 3 levels "","female","male": 3 3 3 2 3 3 3 3 3 2 ...
##  $ age               : Factor w/ 12 levels "","0s","100s",..: 8 8 5 8 5 5 11 10 10 10 ...
##  $ province          : Factor w/ 4 levels "etc","Gyeonggi-do",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ infection_case    : Factor w/ 52 levels "","Anyang Gunpo Pastors Group",..: 35 7 35 7 7 35 7 42 7 7 ...
##  $ state             : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 1 3 3 3 ...
##  $ symptom_onset_date: Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 2 1 1 ...

建立模型

資料的重抽樣

注意只在train中做重抽樣

Roadmap（步驟）：

Divide dataset in training set and test set
Choose a machine learning model
Apply SMOTE on training set to balance the class distribution
Train model on re-balanced training set
Test performance on (original) test set

遇到的問題：

error:get.knnx(data, query, k, algorithm) : Data non-numeric smotefamily 輸入的必須是數值的

所以最後沒有做SMOTE，在caret package中有一個 sampling 的參數，我們設為 “up”來做一個簡單的重抽樣

# 解決一些模型不平衡的問題

# library(smotefamily)
# trainFeature <- patInfoTest_train%>%
#   select(-c("state"))
# trainTarge <- patInfoTest_train%>%
#   select(c("state"))
# 
# smote_output = SMOTE(X = trainFeature, target=trainTarge, K = 4, dup_size = 10)
# 
# smote_output

模型的建立

# boosted tree model via the gbm package
fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 5,
                           sampling = "up")

# set seed
reset.seed <- function()
{
  # ensure results are repeatable
  set.seed(1337)
}

KNN

# knn
knnFit <- train(state  ~ ., data=patInfoTest_train,  
                method = "knn", trControl = fitControl
                #preProcess = c("center","scale"), tuneLength = 20
                )

#Output of kNN fit
knnFit

## k-Nearest Neighbors 
## 
## 3584 samples
##    5 predictor
##    3 classes: 'deceased', 'isolated', 'released' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3225, 3226, 3226, 3225, 3226, 3225, ... 
## Addtional sampling using up-sampling
## 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7111094  0.4874776
##   7  0.6986613  0.4671362
##   9  0.6969896  0.4654146
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.

plot(knnFit)

# Predict with KNN
knnPredict <- predict(knnFit,newdata = patInfoTest_testX )
# Get the confusion matrix to see accuracy value and other parameter values
confusionMatrix(knnPredict, patInfoTest_test$state )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction deceased isolated released
##   deceased       15       63      137
##   isolated        1      468      169
##   released        0      110      573
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6875          
##                  95% CI : (0.6637, 0.7106)
##     No Information Rate : 0.5723          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4525          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: deceased Class: isolated Class: released
## Sensitivity                 0.937500          0.7301          0.6519
## Specificity                 0.868421          0.8101          0.8326
## Pos Pred Value              0.069767          0.7335          0.8389
## Neg Pred Value              0.999243          0.8073          0.6413
## Prevalence                  0.010417          0.4173          0.5723
## Detection Rate              0.009766          0.3047          0.3730
## Detection Prevalence        0.139974          0.4154          0.4447
## Balanced Accuracy           0.902961          0.7701          0.7422

DT

# decision tree
train.control <- trainControl(
                           method = "repeatedcv",
                           number = 10, ## 10-fold CV
                           repeats = 10,## repeated three times
                           classProbs = TRUE,
                           sampling = "up"
                           )
# decision tree
dtFit <- train(state  ~ ., data=patInfoTest_train, 
                 method = "rpart2", 
                 tuneLength = 6,
                 trControl = train.control,
                 metric = "ROC"
               )

## Warning in train.default(x, y, weights = w, ...): The metric "ROC" was not in
## the result set. Accuracy will be used instead.

dtFit

## CART 
## 
## 3584 samples
##    5 predictor
##    3 classes: 'deceased', 'isolated', 'released' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 3226, 3226, 3225, 3226, 3226, 3224, ... 
## Addtional sampling using up-sampling
## 
## Resampling results across tuning parameters:
## 
##   maxdepth  Accuracy   Kappa    
##    1        0.2449747  0.1490424
##    4        0.7454504  0.5052857
##   10        0.7368265  0.4979582
##   11        0.7368265  0.4980333
##   12        0.7368265  0.4980333
##   13        0.7368265  0.4980333
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 4.

plot(dtFit)

# Predict with Decision tree
dtPredict <- predict(dtFit,newdata = patInfoTest_testX )
# Get the confusion matrix to see accuracy value and other parameter values
confusionMatrix(dtPredict, patInfoTest_test$state )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction deceased isolated released
##   deceased       11       36       83
##   isolated        0      329       20
##   released        5      276      776
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7266          
##                  95% CI : (0.7035, 0.7487)
##     No Information Rate : 0.5723          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4644          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: deceased Class: isolated Class: released
## Sensitivity                 0.687500          0.5133          0.8828
## Specificity                 0.921711          0.9777          0.5723
## Pos Pred Value              0.084615          0.9427          0.7342
## Neg Pred Value              0.996444          0.7372          0.7850
## Prevalence                  0.010417          0.4173          0.5723
## Detection Rate              0.007161          0.2142          0.5052
## Detection Prevalence        0.084635          0.2272          0.6882
## Balanced Accuracy           0.804605          0.7455          0.7276

GBM

# gbm
gbmFit1 <- train(state  ~ ., data=patInfoTest_train, 
                 method = "gbm", trControl = fitControl,
                 verbose = FALSE)

gbmFit1

## Stochastic Gradient Boosting 
## 
## 3584 samples
##    5 predictor
##    3 classes: 'deceased', 'isolated', 'released' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3226, 3226, 3226, 3225, 3226, 3226, ... 
## Addtional sampling using up-sampling
## 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.7593168  0.5415468
##   1                  100      0.7647299  0.5634528
##   1                  150      0.7642828  0.5672738
##   2                   50      0.7679637  0.5694582
##   2                  100      0.7681866  0.5743344
##   2                  150      0.7701406  0.5793887
##   3                   50      0.7695814  0.5750767
##   3                  100      0.7689138  0.5765341
##   3                  150      0.7698092  0.5796713
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  2, shrinkage = 0.1 and n.minobsinnode = 10.

# Predict with GBM
gbmPredict <- predict(gbmFit1,newdata = patInfoTest_testX )
# Get the confusion matrix to see accuracy value and other parameter values
confusionMatrix(gbmPredict, patInfoTest_test$state )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction deceased isolated released
##   deceased       14       42       95
##   isolated        2      469      114
##   released        0      130      670
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7507          
##                  95% CI : (0.7282, 0.7721)
##     No Information Rate : 0.5723          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5399          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: deceased Class: isolated Class: released
## Sensitivity                 0.875000          0.7317          0.7622
## Specificity                 0.909868          0.8704          0.8021
## Pos Pred Value              0.092715          0.8017          0.8375
## Neg Pred Value              0.998556          0.8191          0.7160
## Prevalence                  0.010417          0.4173          0.5723
## Detection Rate              0.009115          0.3053          0.4362
## Detection Prevalence        0.098307          0.3809          0.5208
## Balanced Accuracy           0.892434          0.8010          0.7822

——– 隨機森林一直跑不出來，先放在一邊 ———-

# 隨機森林
# train.control <- trainControl(
#                            method = "repeatedcv",
#                            number = 10, ## 10-fold CV
#                            repeats = 5,## repeated three times
#                            classProbs = TRUE,
#                            sampling = "up"
#                            )
# 
# set.seed(100)
# rfFit <- train(state  ~ ., data=patInfoTest_train, method='rf', tuneLength=5,
#                     trControl=train.control)
# rfFit
# plot(rfFit)

# # Predict with RF
# rfPredict <- predict(rfFit,newdata = patInfoTest_testX )
# # Get the confusion matrix to see accuracy value and other parameter values
# confusionMatrix(rfPredict, patInfoTest_test$state )

模型的評估

# KNN
# Confusion Matrix and Statistics
# 
#           Reference
# Prediction deceased isolated released
#   deceased       15       58      123
#   isolated        1      467      176
#   released        0      116      580
# 
# Overall Statistics
#                                           
# -------------- Accuracy : 0.6914 --------------          
#                  95% CI : (0.6676, 0.7144)
#     No Information Rate : 0.5723          
#     P-Value [Acc > NIR] : < 2.2e-16       
#                                           
#                   Kappa : 0.4532          
#                                           
#  Mcnemar's Test P-Value : < 2.2e-16       

# DT
# Confusion Matrix and Statistics
# 
#           Reference
# Prediction deceased isolated released
#   deceased       11       36       83
#   isolated        0      329       20
#   released        5      276      776
# 
# Overall Statistics
#                                           
# -------------- Accuracy : 0.7266 --------------         
#                  95% CI : (0.7035, 0.7487)
#     No Information Rate : 0.5723          
#     P-Value [Acc > NIR] : < 2.2e-16       
#                                           
#                   Kappa : 0.4644          
#                                           
#  Mcnemar's Test P-Value : < 2.2e-16       

# GBM
# Confusion Matrix and Statistics
# 
#           Reference
# Prediction deceased isolated released
#   deceased       14       48       99
#   isolated        0      435       97
#   released        2      158      683
# 
# Overall Statistics
#                                           
# -------------- Accuracy : 0.737 --------------           
#                  95% CI : (0.7142, 0.7589)
#     No Information Rate : 0.5723          
#     P-Value [Acc > NIR] : < 2.2e-16       
#                                           
#                   Kappa : 0.5132          
#                                           
#  Mcnemar's Test P-Value : < 2.2e-16

只看Accuracy的話最好的是GMB模型。

但同時我們希望這個模型可以更好的預測出可能死亡的案例，這樣才不會耽誤他的治療。那麼我們可以看到實際死亡，而預測也為死亡的最為準確的其實是KNN。

進一步分析

看看分類錯誤的案例是哪些

三個模型：

將預測值和正式值合併，找到預測錯誤，即兩個欄位不一致的rows。

分析這些欄位都有什麼特徵，同時可以看看三個模型對於預測錯位的案例有什麼共性和差異性

# 三個模型的預測值：knnPredict；dtPredict；gbmPredict 
# 原始資料：patInfoTest_test$state 
head(patInfoTest_test)

##       sex age province       infection_case    state symptom_onset_date
## 2    male 30s    Seoul      overseas inflow released                  0
## 5  female 20s    Seoul contact with patient released                  0
## 8    male 20s    Seoul      overseas inflow released                  0
## 9    male 30s    Seoul      overseas inflow released                  0
## 10 female 60s    Seoul contact with patient released                  0
## 13 female 60s    Seoul contact with patient released                  1

patInfoTest_compare <- patInfoTest_test %>%
  bind_cols(knnPredict)%>%
  bind_cols(dtPredict)%>%
  bind_cols(gbmPredict)

## New names:
## * NA -> ...7

## New names:
## * NA -> ...8

## New names:
## * NA -> ...9

patInfoTest_compare <- patInfoTest_compare %>%
  rename(knnPredict =...7,
         dtPredict = ...8,
         gbmPredict =...9)

str(patInfoTest_compare)

## 'data.frame':    1536 obs. of  9 variables:
##  $ sex               : Factor w/ 3 levels "","female","male": 3 2 3 3 2 2 3 3 2 3 ...
##  $ age               : Factor w/ 12 levels "","0s","100s",..: 6 5 5 6 9 9 10 5 10 11 ...
##  $ province          : Factor w/ 4 levels "etc","Gyeonggi-do",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ infection_case    : Factor w/ 52 levels "","Anyang Gunpo Pastors Group",..: 35 7 35 35 7 7 7 14 42 7 ...
##  $ state             : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ symptom_onset_date: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
##  $ knnPredict        : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 2 2 3 1 ...
##  $ dtPredict         : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 1 3 1 1 ...
##  $ gbmPredict        : Factor w/ 3 levels "deceased","isolated",..: 3 3 3 3 3 3 3 3 3 1 ...

# 找到KNN中預測對的和預測錯誤的案例
patInfoTest_compKNN <- patInfoTest_compare%>%
  mutate(knnError = ifelse(state==knnPredict,1,0))%>% # 建立一个新的label，便于之后filter
  filter(knnError==0)

patInfoTest_compKNN %>%
  count(province)%>%
  ggplot(aes(reorder(province, +n),n))+
  geom_col()+
  theme(text = element_text(size=10),
     axis.text.x = element_text(angle=45, hjust=1))

patInfoTest_compKNN %>%
  count(age)%>%
  ggplot(aes(age,n))+
  geom_col()+
  theme(text = element_text(size=10),
     axis.text.x = element_text(angle=45, hjust=1))

patInfoTest_compKNN %>%
  count(infection_case)%>%
  ggplot(aes(reorder(infection_case, +n),n))+
  geom_col()+
  theme(text = element_text(size=10),
     axis.text.x = element_text(angle=45, hjust=1))

patInfoTest_compKNN %>%
  count(province, age)%>%
  ggplot(aes(age,n,fill=province))+
  geom_col()+
  theme(text = element_text(size=10),
     axis.text.x = element_text(angle=45, hjust=1))