Titanic 資料集的預測及模型評估

本篇主要針對Titanic資料集進行資料的前處理、模型的建立、模型的預測效果分析。

比較著重在於模型建立好後的評估以及各個模型之間的比較，所以前面對於資料集的探索性分析以及視覺化的部分比較少。

資料處理流程

資料前處理

簡單的Feature Engineering
- 只保留Cabin的艙位號（前面的字母）。
- 把Name中的有一定含義的 title 元素提取出來，並將比較少用的title合併到比較常用的tittle中，建立一個新的類別“Title”
- 把姓氏提取出來，創建新的類別“Surname”
Missing Data
- NA值 & 空白值
減少資料量
- 屬性的篩選：刪掉不要的屬性
正規化處理

模型的建立

隨機森林（Random Forest）
SVM（Support Vector Machines）
GBM（Gradient Boosting Machine）

模型的解釋

累積殘差分佈
變數的重要性

預測及分析

資料前處理

載入 & check

載入 packages

# 載入 packages
library('data.table') # input data
library('ggplot2') # visualization
library('ggthemes') # visualization
library('scales') # visualization
library("GGally") # plot data correlation

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library('dplyr') # data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library('tidyr')

library('caret') # train model

## Loading required package: lattice

library('randomForest') # classification algorithm

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library('mice') # imputation

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library('pROC') # ROC curves

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# 這裡直接利用DALEX包的解釋函數對三個模型進行解釋性分析。
library("tidyverse")

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ tibble  3.1.0     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ✓ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()        masks data.table::between()
## x readr::col_factor()     masks scales::col_factor()
## x randomForest::combine() masks dplyr::combine()
## x purrr::discard()        masks scales::discard()
## x mice::filter()          masks dplyr::filter(), stats::filter()
## x dplyr::first()          masks data.table::first()
## x dplyr::lag()            masks stats::lag()
## x dplyr::last()           masks data.table::last()
## x purrr::lift()           masks caret::lift()
## x randomForest::margin()  masks ggplot2::margin()
## x purrr::transpose()      masks data.table::transpose()

library("DALEX")

## Welcome to DALEX (version: 2.2.0).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## Additional features will be available after installation of: ggpubr.
## Use 'install_dependencies()' to get all suggested dependencies

## 
## Attaching package: 'DALEX'

## The following object is masked from 'package:dplyr':
## 
##     explain

載入 data

# 載入 data
titanicTrainData = fread('/Users/a1234/Desktop/SCHOOL/碩班第二學期/Data mining/HW/titanic_disaster/train.csv',
                         encoding ='UTF-8',header = T, stringsAsFactors = F)
titanicTestData = fread('/Users/a1234/Desktop/SCHOOL/碩班第二學期/Data mining/HW/titanic_disaster/test.csv',encoding = 'UTF-8',
                    header = T, stringsAsFactors = F)

填補NA值，將test和train合併一起做前處理

# 查看看一下 test 的資料集，填補NA值然後合到一起處理
apply(titanicTestData,2,function(x){sum(is.na(x))}) # Age:86, Fare:1

## PassengerId      Pclass        Name         Sex         Age       SibSp 
##           0           0           0           0          86           0 
##       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           1           0           0

titanicTestData$Survived <- NA

# 把train和test合在一起處理 
titanicData = rbind(titanicTrainData,titanicTestData) # train：1:819

# 查看資料的基本型態
head(titanicData )

##    PassengerId Survived Pclass
## 1:           1        0      3
## 2:           2        1      1
## 3:           3        1      3
## 4:           4        1      1
## 5:           5        0      3
## 6:           6        0      3
##                                                   Name    Sex Age SibSp Parch
## 1:                             Braund, Mr. Owen Harris   male  22     1     0
## 2: Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3:                              Heikkinen, Miss. Laina female  26     0     0
## 4:        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5:                            Allen, Mr. William Henry   male  35     0     0
## 6:                                    Moran, Mr. James   male  NA     0     0
##              Ticket    Fare Cabin Embarked
## 1:        A/5 21171  7.2500              S
## 2:         PC 17599 71.2833   C85        C
## 3: STON/O2. 3101282  7.9250              S
## 4:           113803 53.1000  C123        S
## 5:           373450  8.0500              S
## 6:           330877  8.4583              Q

str(titanicData )

## Classes 'data.table' and 'data.frame':   1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
##  - attr(*, ".internal.selfref")=<externalptr>

# 查看個人資料各欄位資料的狀態
summary(titanicData)

##   PassengerId      Survived          Pclass          Name          
##  Min.   :   1   Min.   :0.0000   Min.   :1.000   Length:1309       
##  1st Qu.: 328   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median : 655   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   : 655   Mean   :0.3838   Mean   :2.295                     
##  3rd Qu.: 982   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :1309   Max.   :1.0000   Max.   :3.000                     
##                 NA's   :418                                        
##      Sex                 Age            SibSp            Parch      
##  Length:1309        Min.   : 0.17   Min.   :0.0000   Min.   :0.000  
##  Class :character   1st Qu.:21.00   1st Qu.:0.0000   1st Qu.:0.000  
##  Mode  :character   Median :28.00   Median :0.0000   Median :0.000  
##                     Mean   :29.88   Mean   :0.4989   Mean   :0.385  
##                     3rd Qu.:39.00   3rd Qu.:1.0000   3rd Qu.:0.000  
##                     Max.   :80.00   Max.   :8.0000   Max.   :9.000  
##                     NA's   :263                                     
##     Ticket               Fare            Cabin             Embarked        
##  Length:1309        Min.   :  0.000   Length:1309        Length:1309       
##  Class :character   1st Qu.:  7.896   Class :character   Class :character  
##  Mode  :character   Median : 14.454   Mode  :character   Mode  :character  
##                     Mean   : 33.295                                        
##                     3rd Qu.: 31.275                                        
##                     Max.   :512.329                                        
##                     NA's   :1

# 查看各個倉位的各種類數量及所佔比例
titanicData %>% 
  group_by(Pclass) %>% 
  summarise(n = n(), ratio = n()/ nrow(titanicData))

## # A tibble: 3 x 3
##   Pclass     n ratio
##    <int> <int> <dbl>
## 1      1   323 0.247
## 2      2   277 0.212
## 3      3   709 0.542

一些簡單的feature engineering

只保留Cabin的艙位號（前面的字母）

# Cabin 是艙位號，我們只要保留前面的字母就可以了
# strsplit(x, NULL)[[1]][1] 將x用NULL分開，並提取第一欄中的提一個元素
titanicData$Cabin<- sapply(titanicData$Cabin,
                                function(x){strsplit(x, NULL)[[1]][1]})
head(titanicData)

##    PassengerId Survived Pclass
## 1:           1        0      3
## 2:           2        1      1
## 3:           3        1      3
## 4:           4        1      1
## 5:           5        0      3
## 6:           6        0      3
##                                                   Name    Sex Age SibSp Parch
## 1:                             Braund, Mr. Owen Harris   male  22     1     0
## 2: Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3:                              Heikkinen, Miss. Laina female  26     0     0
## 4:        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5:                            Allen, Mr. William Henry   male  35     0     0
## 6:                                    Moran, Mr. James   male  NA     0     0
##              Ticket    Fare Cabin Embarked
## 1:        A/5 21171  7.2500  <NA>        S
## 2:         PC 17599 71.2833     C        C
## 3: STON/O2. 3101282  7.9250  <NA>        S
## 4:           113803 53.1000     C        S
## 5:           373450  8.0500  <NA>        S
## 6:           330877  8.4583  <NA>        Q

把Name中的有一定含義的 title 元素提取出來

# 把Name中的有一定含義的 title 元素提取出來
# gsub(pattern = "i",replacement = "l", x = race) 把x中的i都用l來代替
# Mr，Miss ......
titanicData$Title <- gsub('(.*, )|(\\..*)', '', titanicData$Name)
head(titanicData)

##    PassengerId Survived Pclass
## 1:           1        0      3
## 2:           2        1      1
## 3:           3        1      3
## 4:           4        1      1
## 5:           5        0      3
## 6:           6        0      3
##                                                   Name    Sex Age SibSp Parch
## 1:                             Braund, Mr. Owen Harris   male  22     1     0
## 2: Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3:                              Heikkinen, Miss. Laina female  26     0     0
## 4:        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5:                            Allen, Mr. William Henry   male  35     0     0
## 6:                                    Moran, Mr. James   male  NA     0     0
##              Ticket    Fare Cabin Embarked Title
## 1:        A/5 21171  7.2500  <NA>        S    Mr
## 2:         PC 17599 71.2833     C        C   Mrs
## 3: STON/O2. 3101282  7.9250  <NA>        S  Miss
## 4:           113803 53.1000     C        S   Mrs
## 5:           373450  8.0500  <NA>        S    Mr
## 6:           330877  8.4583  <NA>        Q    Mr

# 將比較少用的title合併到比較常用的tittle中
# 不同性別的title的差異
table(titanicData$Sex, titanicData$Title)

##         
##          Capt Col Don Dona  Dr Jonkheer Lady Major Master Miss Mlle Mme  Mr Mrs
##   female    0   0   0    1   1        0    1     0      0  260    2   1   0 197
##   male      1   4   1    0   7        1    0     2     61    0    0   0 757   0
##         
##           Ms Rev Sir the Countess
##   female   2   0   0            1
##   male     0   8   1            0

# 把比較少的title整合起來
rare_title <- c('Dona', 'Lady', 'the Countess','Capt', 'Col', 'Don', 
                'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer')
titanicData$Title[titanicData$Title == 'Mlle']        <- 'Miss' 
titanicData$Title[titanicData$Title == 'Ms']          <- 'Miss'
titanicData$Title[titanicData$Title == 'Mme']         <- 'Mrs' 
titanicData$Title[titanicData$Title %in% rare_title]  <- 'Rare Title'

# 整合後不同性別的title的差異
table(titanicData$Sex, titanicData$Title)

##         
##          Master Miss  Mr Mrs Rare Title
##   female      0  264   0 198          4
##   male       61    0 757   0         25

把姓氏提取出來，建立一個新的類別“Surname”

# 把姓氏提取出來
titanicData$Surname <- sapply(titanicData$Name,
                      function(x){strsplit(x, '[,.]')[[1]][1]})
head(titanicData)

##    PassengerId Survived Pclass
## 1:           1        0      3
## 2:           2        1      1
## 3:           3        1      3
## 4:           4        1      1
## 5:           5        0      3
## 6:           6        0      3
##                                                   Name    Sex Age SibSp Parch
## 1:                             Braund, Mr. Owen Harris   male  22     1     0
## 2: Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3:                              Heikkinen, Miss. Laina female  26     0     0
## 4:        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5:                            Allen, Mr. William Henry   male  35     0     0
## 6:                                    Moran, Mr. James   male  NA     0     0
##              Ticket    Fare Cabin Embarked Title   Surname
## 1:        A/5 21171  7.2500  <NA>        S    Mr    Braund
## 2:         PC 17599 71.2833     C        C   Mrs   Cumings
## 3: STON/O2. 3101282  7.9250  <NA>        S  Miss Heikkinen
## 4:           113803 53.1000     C        S   Mrs  Futrelle
## 5:           373450  8.0500  <NA>        S    Mr     Allen
## 6:           330877  8.4583  <NA>        Q    Mr     Moran

缺失值和空值的處理 missing value

# 查看資料是否有缺值、NA值
sum(is.na(titanicData))

## [1] 1696

# 查看各個欄位的missing data
missingData <- apply(titanicData,2,function(x){sum(is.na(x))})
missingData

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           1        1014           0 
##       Title     Surname 
##           0           0

👆缺失值主要來自NA和Cabin

可以用rpart (recursive partitioning for regression) 這個package來預測age,也可以用mice package來填補，這邊我們選擇的是mice. 用MICE填補遺漏值:

# 先查看當前資料型態
str(titanicData)

## Classes 'data.table' and 'data.frame':   1309 obs. of  14 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  NA "C" NA "C" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
##  $ Title      : chr  "Mr" "Mrs" "Miss" "Mrs" ...
##  $ Surname    : chr  "Braund" "Cumings" "Heikkinen" "Futrelle" ...
##  - attr(*, ".internal.selfref")=<externalptr>

# 用mice package來填補age的missing value
# 先把 Survived這個欄位分出來
F_titanicData <- titanicData[,!c("Survived")]
T_titanicData <- titanicData[,c("Survived")]
head(F_titanicData)

##    PassengerId Pclass                                                Name
## 1:           1      3                             Braund, Mr. Owen Harris
## 2:           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer)
## 3:           3      3                              Heikkinen, Miss. Laina
## 4:           4      1        Futrelle, Mrs. Jacques Heath (Lily May Peel)
## 5:           5      3                            Allen, Mr. William Henry
## 6:           6      3                                    Moran, Mr. James
##       Sex Age SibSp Parch           Ticket    Fare Cabin Embarked Title
## 1:   male  22     1     0        A/5 21171  7.2500  <NA>        S    Mr
## 2: female  38     1     0         PC 17599 71.2833     C        C   Mrs
## 3: female  26     0     0 STON/O2. 3101282  7.9250  <NA>        S  Miss
## 4: female  35     1     0           113803 53.1000     C        S   Mrs
## 5:   male  35     0     0           373450  8.0500  <NA>        S    Mr
## 6:   male  NA     0     0           330877  8.4583  <NA>        Q    Mr
##      Surname
## 1:    Braund
## 2:   Cumings
## 3: Heikkinen
## 4:  Futrelle
## 5:     Allen
## 6:     Moran

mice_titanicData <- mice(F_titanicData,
                  m = 3,           # 產生三個被填補好的資料表
                  maxit = 50,      # max iteration
                  method = "cart", # 使用CART決策樹，進行遺漏值預測
                  seed = 188)      # set.seed()，令抽樣每次都一樣

## 
##  iter imp variable
##   1   1  Age  Fare
##   1   2  Age  Fare
##   1   3  Age  Fare
##   2   1  Age  Fare
##   2   2  Age  Fare
##   2   3  Age  Fare
##   3   1  Age  Fare
##   3   2  Age  Fare
##   3   3  Age  Fare
##   4   1  Age  Fare
##   4   2  Age  Fare
##   4   3  Age  Fare
##   5   1  Age  Fare
##   5   2  Age  Fare
##   5   3  Age  Fare
##   6   1  Age  Fare
##   6   2  Age  Fare
##   6   3  Age  Fare
##   7   1  Age  Fare
##   7   2  Age  Fare
##   7   3  Age  Fare
##   8   1  Age  Fare
##   8   2  Age  Fare
##   8   3  Age  Fare
##   9   1  Age  Fare
##   9   2  Age  Fare
##   9   3  Age  Fare
##   10   1  Age  Fare
##   10   2  Age  Fare
##   10   3  Age  Fare
##   11   1  Age  Fare
##   11   2  Age  Fare
##   11   3  Age  Fare
##   12   1  Age  Fare
##   12   2  Age  Fare
##   12   3  Age  Fare
##   13   1  Age  Fare
##   13   2  Age  Fare
##   13   3  Age  Fare
##   14   1  Age  Fare
##   14   2  Age  Fare
##   14   3  Age  Fare
##   15   1  Age  Fare
##   15   2  Age  Fare
##   15   3  Age  Fare
##   16   1  Age  Fare
##   16   2  Age  Fare
##   16   3  Age  Fare
##   17   1  Age  Fare
##   17   2  Age  Fare
##   17   3  Age  Fare
##   18   1  Age  Fare
##   18   2  Age  Fare
##   18   3  Age  Fare
##   19   1  Age  Fare
##   19   2  Age  Fare
##   19   3  Age  Fare
##   20   1  Age  Fare
##   20   2  Age  Fare
##   20   3  Age  Fare
##   21   1  Age  Fare
##   21   2  Age  Fare
##   21   3  Age  Fare
##   22   1  Age  Fare
##   22   2  Age  Fare
##   22   3  Age  Fare
##   23   1  Age  Fare
##   23   2  Age  Fare
##   23   3  Age  Fare
##   24   1  Age  Fare
##   24   2  Age  Fare
##   24   3  Age  Fare
##   25   1  Age  Fare
##   25   2  Age  Fare
##   25   3  Age  Fare
##   26   1  Age  Fare
##   26   2  Age  Fare
##   26   3  Age  Fare
##   27   1  Age  Fare
##   27   2  Age  Fare
##   27   3  Age  Fare
##   28   1  Age  Fare
##   28   2  Age  Fare
##   28   3  Age  Fare
##   29   1  Age  Fare
##   29   2  Age  Fare
##   29   3  Age  Fare
##   30   1  Age  Fare
##   30   2  Age  Fare
##   30   3  Age  Fare
##   31   1  Age  Fare
##   31   2  Age  Fare
##   31   3  Age  Fare
##   32   1  Age  Fare
##   32   2  Age  Fare
##   32   3  Age  Fare
##   33   1  Age  Fare
##   33   2  Age  Fare
##   33   3  Age  Fare
##   34   1  Age  Fare
##   34   2  Age  Fare
##   34   3  Age  Fare
##   35   1  Age  Fare
##   35   2  Age  Fare
##   35   3  Age  Fare
##   36   1  Age  Fare
##   36   2  Age  Fare
##   36   3  Age  Fare
##   37   1  Age  Fare
##   37   2  Age  Fare
##   37   3  Age  Fare
##   38   1  Age  Fare
##   38   2  Age  Fare
##   38   3  Age  Fare
##   39   1  Age  Fare
##   39   2  Age  Fare
##   39   3  Age  Fare
##   40   1  Age  Fare
##   40   2  Age  Fare
##   40   3  Age  Fare
##   41   1  Age  Fare
##   41   2  Age  Fare
##   41   3  Age  Fare
##   42   1  Age  Fare
##   42   2  Age  Fare
##   42   3  Age  Fare
##   43   1  Age  Fare
##   43   2  Age  Fare
##   43   3  Age  Fare
##   44   1  Age  Fare
##   44   2  Age  Fare
##   44   3  Age  Fare
##   45   1  Age  Fare
##   45   2  Age  Fare
##   45   3  Age  Fare
##   46   1  Age  Fare
##   46   2  Age  Fare
##   46   3  Age  Fare
##   47   1  Age  Fare
##   47   2  Age  Fare
##   47   3  Age  Fare
##   48   1  Age  Fare
##   48   2  Age  Fare
##   48   3  Age  Fare
##   49   1  Age  Fare
##   49   2  Age  Fare
##   49   3  Age  Fare
##   50   1  Age  Fare
##   50   2  Age  Fare
##   50   3  Age  Fare

## Warning: Number of logged events: 7

# 填補好的資料：因為m=3，所以會有三個填補好的資料集，可以用以下方式取出
head(complete(mice_titanicData, 1)) # 1st data

##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## 3           3      3                              Heikkinen, Miss. Laina female
## 4           4      1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female
## 5           5      3                            Allen, Mr. William Henry   male
## 6           6      3                                    Moran, Mr. James   male
##   Age SibSp Parch           Ticket    Fare Cabin Embarked Title   Surname
## 1  22     1     0        A/5 21171  7.2500  <NA>        S    Mr    Braund
## 2  38     1     0         PC 17599 71.2833     C        C   Mrs   Cumings
## 3  26     0     0 STON/O2. 3101282  7.9250  <NA>        S  Miss Heikkinen
## 4  35     1     0           113803 53.1000     C        S   Mrs  Futrelle
## 5  35     0     0           373450  8.0500  <NA>        S    Mr     Allen
## 6  20     0     0           330877  8.4583  <NA>        Q    Mr     Moran

head(complete(mice_titanicData, 2))# 2nd data

##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## 3           3      3                              Heikkinen, Miss. Laina female
## 4           4      1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female
## 5           5      3                            Allen, Mr. William Henry   male
## 6           6      3                                    Moran, Mr. James   male
##   Age SibSp Parch           Ticket    Fare Cabin Embarked Title   Surname
## 1  22     1     0        A/5 21171  7.2500  <NA>        S    Mr    Braund
## 2  38     1     0         PC 17599 71.2833     C        C   Mrs   Cumings
## 3  26     0     0 STON/O2. 3101282  7.9250  <NA>        S  Miss Heikkinen
## 4  35     1     0           113803 53.1000     C        S   Mrs  Futrelle
## 5  35     0     0           373450  8.0500  <NA>        S    Mr     Allen
## 6  20     0     0           330877  8.4583  <NA>        Q    Mr     Moran

head(complete(mice_titanicData, 3))# 3rd data

##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## 3           3      3                              Heikkinen, Miss. Laina female
## 4           4      1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female
## 5           5      3                            Allen, Mr. William Henry   male
## 6           6      3                                    Moran, Mr. James   male
##   Age SibSp Parch           Ticket    Fare Cabin Embarked Title   Surname
## 1  22     1     0        A/5 21171  7.2500  <NA>        S    Mr    Braund
## 2  38     1     0         PC 17599 71.2833     C        C   Mrs   Cumings
## 3  26     0     0 STON/O2. 3101282  7.9250  <NA>        S  Miss Heikkinen
## 4  35     1     0           113803 53.1000     C        S   Mrs  Futrelle
## 5  35     0     0           373450  8.0500  <NA>        S    Mr     Allen
## 6  22     0     0           330877  8.4583  <NA>        Q    Mr     Moran

# e.g. 拿第1個資料，作為我後續分析的資料 
M_titanicData <- complete(mice_titanicData, 1) # 1st data

# 把 Survived 這個欄位合起來進行後續的處理
M_titanicData <- cbind(M_titanicData, T_titanicData)
head(M_titanicData)

##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## 3           3      3                              Heikkinen, Miss. Laina female
## 4           4      1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female
## 5           5      3                            Allen, Mr. William Henry   male
## 6           6      3                                    Moran, Mr. James   male
##   Age SibSp Parch           Ticket    Fare Cabin Embarked Title   Surname
## 1  22     1     0        A/5 21171  7.2500  <NA>        S    Mr    Braund
## 2  38     1     0         PC 17599 71.2833     C        C   Mrs   Cumings
## 3  26     0     0 STON/O2. 3101282  7.9250  <NA>        S  Miss Heikkinen
## 4  35     1     0           113803 53.1000     C        S   Mrs  Futrelle
## 5  35     0     0           373450  8.0500  <NA>        S    Mr     Allen
## 6  20     0     0           330877  8.4583  <NA>        Q    Mr     Moran
##   Survived
## 1        0
## 2        1
## 3        1
## 4        1
## 5        0
## 6        0

# 再次查看空值和空白值
missingData <- apply(M_titanicData,2,function(x){sum(is.na(x))})
missingData

## PassengerId      Pclass        Name         Sex         Age       SibSp 
##           0           0           0           0           0           0 
##       Parch      Ticket        Fare       Cabin    Embarked       Title 
##           0           0           0        1014           0           0 
##     Surname    Survived 
##           0         418

# 在前面用str(titanicData )查看資料型態的時候發現 Cabin 有很多是空白值""
# 查看各個欄位包含的空白值
blankData <- apply(M_titanicData,2,function(x){sum((x=="")==TRUE)})
blankData

## PassengerId      Pclass        Name         Sex         Age       SibSp 
##           0           0           0           0           0           0 
##       Parch      Ticket        Fare       Cabin    Embarked       Title 
##           0           0           0          NA           2           0 
##     Surname    Survived 
##           0          NA

NA值目前就只存在test中的y值和Cabin中了

空值只有Embarked中有2個空值

# 用Embarked中最多的屬性來填補空值（最多的是S）
table(M_titanicData$Embarked) # S=914

## 
##       C   Q   S 
##   2 270 123 914

M_titanicData[c(which(M_titanicData$Embarked=="")),"Embarked"] ="S"
table(M_titanicData$Embarked) # S=916

## 
##   C   Q   S 
## 270 123 916

head(M_titanicData)

##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## 3           3      3                              Heikkinen, Miss. Laina female
## 4           4      1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female
## 5           5      3                            Allen, Mr. William Henry   male
## 6           6      3                                    Moran, Mr. James   male
##   Age SibSp Parch           Ticket    Fare Cabin Embarked Title   Surname
## 1  22     1     0        A/5 21171  7.2500  <NA>        S    Mr    Braund
## 2  38     1     0         PC 17599 71.2833     C        C   Mrs   Cumings
## 3  26     0     0 STON/O2. 3101282  7.9250  <NA>        S  Miss Heikkinen
## 4  35     1     0           113803 53.1000     C        S   Mrs  Futrelle
## 5  35     0     0           373450  8.0500  <NA>        S    Mr     Allen
## 6  20     0     0           330877  8.4583  <NA>        Q    Mr     Moran
##   Survived
## 1        0
## 2        1
## 3        1
## 4        1
## 5        0
## 6        0

減少資料量

刪掉不重要的屬性

Name & Surname：類別太多了，並且沒有什麼特別的用途。
Ticket：裡面都是一些隨機的數字，沒有特多含義，並且是多值屬性不好處理。
Cabin：有太多的NA值，並且為類別變數不好填充，如果給一個Ncabin的類別，會使得屬性非常unbalance
PassengerId：只是一個序列號

# datacol <- M_titanicData%>%
#   select(c("Cabin","Fare","Embarked"))
# ggpairs(datacol)

# 刪除不需要的屬性
M_titanicData <- M_titanicData %>%
  select(!c("Name","Ticket","Cabin","PassengerId","Surname"))
head(M_titanicData)

##   Pclass    Sex Age SibSp Parch    Fare Embarked Title Survived
## 1      3   male  22     1     0  7.2500        S    Mr        0
## 2      1 female  38     1     0 71.2833        C   Mrs        1
## 3      3 female  26     0     0  7.9250        S  Miss        1
## 4      1 female  35     1     0 53.1000        S   Mrs        1
## 5      3   male  35     0     0  8.0500        S    Mr        0
## 6      3   male  20     0     0  8.4583        Q    Mr        0

正規化數值變數

數值變量的range沒有特別的大，所以沒有做特別的正規化處理

# normalize <- function(x){(x- mean(x))/ sd(x)}
# M_titanicData <- M_titanicData %>%
#   mutate(fare = normalize(Fare))%>%
#   mutate(age = normalize(Age))
# M_titanicData

建立模型

我們先把前處理好的資料集切開為train和test

# 把train和test的模型切開
trainTitanicData <- M_titanicData[1:891,]
testTitanicData <- M_titanicData[892:nrow(M_titanicData),]

# 再把train中分出train和validation另個集合70%/30%（因為test沒有y值，沒辦法驗證）
# 分成70%作為train，30%作為validation
set.seed(100)
index_row <- sample(2, nrow(trainTitanicData), replace = T, 
                    prob = c(0.7, 0.3)) # assign values to the rows (1: Training, 2: Test)
train_data <- trainTitanicData[index_row == 1,]
valid_data <- trainTitanicData[index_row == 2,]

隨機森林

# 隨機森林
set.seed(100)
titanic_rf <- train(factor(Survived) ~ ., data=train_data, method='rf', tuneLength=10,
                    trControl=trainControl(method="cv", number=5))
titanic_rf

## Random Forest 
## 
## 610 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 487, 488, 488, 489, 488 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8426452  0.6639845
##    3    0.8475368  0.6749392
##    4    0.8392994  0.6585823
##    5    0.8392588  0.6592384
##    6    0.8375928  0.6559679
##    7    0.8294225  0.6388409
##    8    0.8228649  0.6267748
##    9    0.8228651  0.6267118
##   10    0.8212122  0.6235291
##   12    0.8245449  0.6307439
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.

plot(titanic_rf)

SVM

These requirements exist because whenever caret::train() runs a tree-based model (here random forests), it converts the factor levels into variables which are used to split the tree. Hence factor level labels become variable names.

So for each of these variables, you can convert the level names to valid labels with the following code.

<!-- trainData %>%  -->
<!--   mutate(Education = factor(Education,  -->
<!--           labels = make.names(levels(Education)))) -->

# 跑SVM需要吧y的部分轉化favtor
SVMtrain_data <- train_data %>% 
  mutate(Survived = factor(Survived, 
          labels = c("yes","no")))
head(SVMtrain_data)

##   Pclass    Sex Age SibSp Parch    Fare Embarked Title Survived
## 1      3   male  22     1     0  7.2500        S    Mr      yes
## 2      1 female  38     1     0 71.2833        C   Mrs       no
## 3      3 female  26     0     0  7.9250        S  Miss       no
## 4      1 female  35     1     0 53.1000        S   Mrs       no
## 5      3   male  35     0     0  8.0500        S    Mr      yes
## 6      3   male  20     0     0  8.4583        Q    Mr      yes

# SVM
set.seed(100)
titanic_svm <- train(factor(Survived) ~., data=SVMtrain_data, method='svmRadial', tuneLength=10,
                     preProcess= c('center', 'scale'), trControl= trainControl(method="cv", number=5, classProbs = T))
titanic_svm

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 610 samples
##   8 predictor
##   2 classes: 'yes', 'no' 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 488, 489, 488, 488, 487 
## Resampling results across tuning parameters:
## 
##   C       Accuracy   Kappa    
##     0.25  0.8393536  0.6587864
##     0.50  0.8393267  0.6573537
##     1.00  0.8360478  0.6494731
##     2.00  0.8278780  0.6329454
##     4.00  0.8196546  0.6181568
##     8.00  0.8130970  0.6036037
##    16.00  0.8048865  0.5852329
##    32.00  0.7950500  0.5621164
##    64.00  0.7933836  0.5570771
##   128.00  0.7884518  0.5448537
## 
## Tuning parameter 'sigma' was held constant at a value of 0.2828407
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.2828407 and C = 0.25.

plot(titanic_svm)

(BM)

# (GBM) model
set.seed(100)
titanic_gbm<- train(factor(Survived) ~., data=train_data, method='gbm', preProcess= c('center', 'scale'),tuneLength=5,
                    trControl=trainControl(method="cv", number=7), verbose=FALSE)
print(titanic_gbm)

## Stochastic Gradient Boosting 
## 
## 610 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 522, 523, 522, 524, 524, 523, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.8163966  0.6088355
##   1                  100      0.8296101  0.6391059
##   1                  150      0.8213991  0.6208928
##   1                  200      0.8230602  0.6244134
##   1                  250      0.8230602  0.6244400
##   2                   50      0.8246836  0.6283279
##   2                  100      0.8181710  0.6141555
##   2                  150      0.8214741  0.6224873
##   2                  200      0.8296474  0.6399608
##   2                  250      0.8232108  0.6255341
##   3                   50      0.8263634  0.6312884
##   3                  100      0.8361596  0.6532392
##   3                  150      0.8378394  0.6580040
##   3                  200      0.8427850  0.6678363
##   3                  250      0.8427659  0.6672287
##   4                   50      0.8296852  0.6400086
##   4                  100      0.8378585  0.6578734
##   4                  150      0.8379158  0.6577285
##   4                  200      0.8362542  0.6553899
##   4                  250      0.8313090  0.6443476
##   5                   50      0.8230224  0.6259848
##   5                  100      0.8280058  0.6375573
##   5                  150      0.8345557  0.6510954
##   5                  200      0.8280245  0.6377047
##   5                  250      0.8312521  0.6449659
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 10.

plot(titanic_gbm)

模型的解釋

這裡直接利用DALEX包的解釋函數對三個模型進行解釋性分析。

# 需要載入的包（在前面已經載入，這邊只是做紀錄）
# library("tidyverse")
# library("caret")
# library("DALEX")

# 做這個分析需要包含4個信息：模型信息；標籤信息（如果沒有，會自動從模型撤除）；驗證數據集；驗證數據集中的是響應變量
# titanic_rf; titanic_svm; titanic_gbm
explainer_rf <- explain(titanic_rf,label = "rf",
                     data = valid_data,
                     y = valid_data$Survived)

## Preparation of a new explainer is initiated
##   -> model label       :  rf 
##   -> data              :  281  rows  9  cols 
##   -> target variable   :  281  values 
##   -> predict function  :  yhat.train  will be used ( [33m default [39m )
##   -> predicted values  :  No value for predict function target column. ( [33m default [39m )
##   -> model_info        :  package caret , ver. 6.0.86 , task classification ( [33m default [39m ) 
##   -> predicted values  :  numerical, min =  0 , mean =  0.3820712 , max =  1  
##   -> residual function :  difference between y and yhat ( [33m default [39m )
##   -> residuals         :  numerical, min =  -0.994 , mean =  -0.02975801 , max =  0.998  
##  [32m A new explainer has been created! [39m

explainer_svm <- explain(titanic_svm,label = "svm",
                       data = valid_data,
                       y = valid_data$Survived)

## Preparation of a new explainer is initiated
##   -> model label       :  svm 
##   -> data              :  281  rows  9  cols 
##   -> target variable   :  281  values 
##   -> predict function  :  yhat.train  will be used ( [33m default [39m )
##   -> predicted values  :  No value for predict function target column. ( [33m default [39m )
##   -> model_info        :  package caret , ver. 6.0.86 , task classification ( [33m default [39m ) 
##   -> predicted values  :  numerical, min =  0.07143009 , mean =  0.4057753 , max =  0.909819  
##   -> residual function :  difference between y and yhat ( [33m default [39m )
##   -> residuals         :  numerical, min =  -0.9068741 , mean =  -0.05346217 , max =  0.9195718  
##  [32m A new explainer has been created! [39m

explainer_gbm <- explain(titanic_gbm,label = "gbm",
                      data = valid_data,
                      y = valid_data$Survived)

## Preparation of a new explainer is initiated
##   -> model label       :  gbm 
##   -> data              :  281  rows  9  cols 
##   -> target variable   :  281  values 
##   -> predict function  :  yhat.train  will be used ( [33m default [39m )
##   -> predicted values  :  No value for predict function target column. ( [33m default [39m )
##   -> model_info        :  package caret , ver. 6.0.86 , task classification ( [33m default [39m ) 
##   -> predicted values  :  numerical, min =  0.006081133 , mean =  0.4167625 , max =  0.9957499  
##   -> residual function :  difference between y and yhat ( [33m default [39m )
##   -> residuals         :  numerical, min =  -0.9955624 , mean =  -0.06444938 , max =  0.9833144  
##  [32m A new explainer has been created! [39m

# 使用model_performance 函數來查看模型表現
per_rf<-model_performance(explainer_rf)
per_svm<-model_performance(explainer_svm)
per_gbm<-model_performance(explainer_gbm)

累積殘差分佈圖

這個圖的正確解釋方法是，少數的樣本（離群點）貢獻的殘差（與真實值的偏差）。如果線在上面，那麼大量的樣本殘差都很大，此圖表明GBM模型大部分樣本的殘差都比較小，而神經網絡很多樣本的殘差都比基於樹模型的高。

# 對模型表現分佈進行可視化。
# 累積殘差分佈圖
plot(per_rf,per_svm,per_gbm)

變量的重要性

不過，這裡介紹一個新的參數設置，就是type = "difference"。在求變量重要性的時候，默認給出的是如果失去了這個變量，那麼模型會發生的實質數值變化。如果把這個參數設置為ratio，那麼就是缺失這個變量與缺失所有變量的變化之比，而設置為difference則為變化之差。這些參數能夠讓我們使用更加豐富的手段判斷一個變量的重要程度。

# 变量重要性分析
importance_rf<-variable_importance(
  explainer_rf,
  loss_function = loss_root_mean_square
)
importance_svm<-variable_importance(
  explainer_svm,
  loss_function = loss_root_mean_square
)
importance_gbm<-variable_importance(
  explainer_gbm,
  loss_function = loss_root_mean_square
)
plot(importance_rf,importance_svm,importance_gbm)

用模型的測及分評估

訓練好的三個model：titanic_rf; titanic_svm; titanic_gbm

# 分出valid的x和y
valid_feature <- valid_data %>%
  select(-c("Survived"))
valid_target <- valid_data[,"Survived"]
head(valid_feature)

##    Pclass    Sex  Age SibSp Parch    Fare Embarked Title
## 7       1   male 54.0     0     0 51.8625        S    Mr
## 12      1 female 58.0     0     0 26.5500        S  Miss
## 15      3 female 14.0     0     0  7.8542        S  Miss
## 22      2   male 34.0     0     0 13.0000        S    Mr
## 24      1   male 28.0     0     0 35.5000        S    Mr
## 27      3   male 28.5     0     0  7.2250        C    Mr

# 使用不同的已經訓練好的模型分類預測：
rf_probs = predict(titanic_rf,valid_feature,type = "prob") # 訓練集中的預測情形
svm_probs = predict(titanic_svm,valid_feature,type = "prob")
gbm_probs = predict(titanic_gbm,valid_feature,type = "prob")

隨機森林

#-------------- rf_probs 預測結果 -----------------
# 合併測試期預測結果與訓練期資料
set.seed(100)
rf_validDataPred <- cbind(valid_data, rf_probs)  

 # 將預測出來的分類機率欄位名稱從(0,1)改回(no,yes)
rf_validDataPred <- rf_validDataPred %>% 
  rename(no = `0`, yes = `1`)                                           
head(rf_validDataPred)

##    Pclass    Sex  Age SibSp Parch    Fare Embarked Title Survived    no   yes
## 7       1   male 54.0     0     0 51.8625        S    Mr        0 0.928 0.072
## 12      1 female 58.0     0     0 26.5500        S  Miss        1 0.214 0.786
## 15      3 female 14.0     0     0  7.8542        S  Miss        0 0.222 0.778
## 22      2   male 34.0     0     0 13.0000        S    Mr        1 0.950 0.050
## 24      1   male 28.0     0     0 35.5000        S    Mr        1 0.798 0.202
## 27      3   male 28.5     0     0  7.2250        C    Mr        0 0.978 0.022

# 查看訓練集的預測情形
rf_summaryvalid <- rf_validDataPred %>% 
  filter(yes > 0.5) %>%                           # yes 為預測存活下來的機率
  summarise(count = n(), 
            accuracy_rate = mean(Survived))              #  統計預測存活下來的人數與實際比較的準確率

rf_summaryvalid

##   count accuracy_rate
## 1   106     0.6981132

# count  accuracy_rate
# 106   0.6981132

# 查看測試集分類機率在不同門檻值下的預測情況
rf_summaryvalidCut <- rf_validDataPred %>% 
  # 看不同的分類機率做區間查看其準確率
  mutate(interval = cut(yes,breaks = c(0,0.5,0.55,0.6,0.65,0.7,0.8,1),include.lowest = T)) %>%  
  group_by(interval) %>%
  summarise(count = n(), 
            accuracy_rate = mean(Survived))

rf_summaryvalidCut

## # A tibble: 7 x 3
##   interval   count accuracy_rate
##   <fct>      <int>         <dbl>
## 1 [0,0.5]      175         0.143
## 2 (0.5,0.55]     3         0.667
## 3 (0.55,0.6]     4         0    
## 4 (0.6,0.65]     2         0    
## 5 (0.65,0.7]     6         0.333
## 6 (0.7,0.8]     14         0.5  
## 7 (0.8,1]       77         0.818

# plot
plot(rf_summaryvalidCut$interval,rf_summaryvalidCut$count)

建立混淆矩陣

# 建立混淆矩陣
# 將預測結果依照分類機率大於0.8當成1
rf_predictResult <- as.numeric(rf_validDataPred %>% as.data.frame() %>% select(yes) > 0.8) 
# 將預測結果與實際結果做成混淆矩陣
rf_confuseMatrix <- table(rf_validDataPred$Survived, rf_predictResult, dnn = c("實際","預測")) 
rf_confuseMatrix

##     預測
## 實際   0   1
##    0 168  14
##    1  36  63

#        預測
# 實際   0   1
#    0 168  14
#    1  36  63

# -------更加細緻的展現混淆矩陣
# 預測存活而實際有存活(TP, True Positive)
TP = rf_confuseMatrix[2,2]

# 預期有存活而實際沒有存活(FP, False Positive)
FP = rf_confuseMatrix[1,2]

# 預測沒有存活而實際有存活(FN, False Negative)
FN = rf_confuseMatrix[2,1]

# 預測沒有存活而實際沒有存活(TN, True Negative)
TN = rf_confuseMatrix[1,1]

# Precision, TP/(TP+FP)
precision <- TP/(TP+FP)

# Recall, TP/(TP+FN)
recall <- TP/(TP+FN)

# 整體準確率(accuracy), accuracy = (TP+TN)/(TP+FP+FN+TN)
accuracy <- (TP+TN)/(TP+FP+FN+TN)

# 計算F1 Score, F1 Score = 2TP/(2TP+FN+FP)
F1_score <- 2*TP/(2*TP+FN+FP)

rf_summaryTable <- data.frame(name=c("TP","FP","FN","TN","Precision","Recall","Accuracy","F1 Score"),
           rf_score=c(TP,FP,FN,TN,precision,recall,accuracy,F1_score))
rf_summaryTable

##        name    rf_score
## 1        TP  63.0000000
## 2        FP  14.0000000
## 3        FN  36.0000000
## 4        TN 168.0000000
## 5 Precision   0.8181818
## 6    Recall   0.6363636
## 7  Accuracy   0.8220641
## 8  F1 Score   0.7159091

RF的ROC

# ROC 的一種畫法
rf_ROC = roc(response = rf_validDataPred$Survived,
              predictor = rf_validDataPred$yes)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(rf_ROC,type = "S",col = "#800101",main=paste("ROC curve "),print.auc=TRUE, print.thres=TRUE)

SVM

#-------------- svm_probs 預測結果 -----------------
# 合併測試期預測結果與訓練期資料
svm_validDataPred <- cbind(valid_data, svm_probs)

# SVM 中的“Yes”預測的是死亡的機率，而“NO”預測的是存活的機率，與其他模型的類別正好相反，需要對類別的名稱做下調整方便後續處理
svm_validDataPred <- svm_validDataPred %>%
  rename(no = yes, yes = no)
head(svm_validDataPred)

##    Pclass    Sex  Age SibSp Parch    Fare Embarked Title Survived        no
## 7       1   male 54.0     0     0 51.8625        S    Mr        0 0.8765977
## 12      1 female 58.0     0     0 26.5500        S  Miss        1 0.2359398
## 15      3 female 14.0     0     0  7.8542        S  Miss        0 0.1275346
## 22      2   male 34.0     0     0 13.0000        S    Mr        1 0.8836847
## 24      1   male 28.0     0     0 35.5000        S    Mr        1 0.8216198
## 27      3   male 28.5     0     0  7.2250        C    Mr        0 0.8843287
##          yes
## 7  0.1234023
## 12 0.7640602
## 15 0.8724654
## 22 0.1163153
## 24 0.1783802
## 27 0.1156713

# 查看訓練集的預測情形
svm_summaryvalid <- svm_validDataPred %>% 
  filter(yes > 0.5) %>%                           # yes 為預測存活下來的機率
  summarise(count = n(), 
            accuracy_rate = mean(Survived))              #  統計預測存活下來的人數與實際比較的準確率

svm_summaryvalid

##   count accuracy_rate
## 1   109     0.6788991

# count  accuracy_rate
# 109      0.6788991

# 查看測試集分類機率在不同門檻值下的預測情況
svm_summaryvalidCut <- svm_validDataPred %>% 
  # 看不同的分類機率做區間查看其準確率
  mutate(interval = cut(yes,breaks = c(0,0.5,0.55,0.6,0.65,0.8,1),include.lowest = T)) %>%  
  group_by(interval) %>%
  summarise(count = n(), 
            accuracy_rate = mean(Survived))

svm_summaryvalidCut

## # A tibble: 6 x 3
##   interval   count accuracy_rate
##   <fct>      <int>         <dbl>
## 1 [0,0.5]      172         0.145
## 2 (0.5,0.55]     4         0.25 
## 3 (0.55,0.6]     3         0.333
## 4 (0.6,0.65]     5         0.6  
## 5 (0.65,0.8]    21         0.714
## 6 (0.8,1]       76         0.711

建立混淆矩陣

# 建立混淆矩陣
# 將預測結果依照分類機率大於0.8當成1
svm_predictResult <- as.numeric(svm_validDataPred %>% as.data.frame() %>% select(yes) > 0.8) 
# 將預測結果與實際結果做成混淆矩陣
svm_confuseMatrix <- table(svm_validDataPred$Survived, svm_predictResult, dnn = c("實際","預測")) 
svm_confuseMatrix

##     預測
## 實際   0   1
##    0 160  22
##    1  45  54

#        預測
# 實際   0   1
#    0 160  22
#    1  45  54

# -------更加細緻的展現混淆矩陣
# 預測存活而實際有存活(TP, True Positive)
TP = svm_confuseMatrix[2,2]

# 預期有存活而實際沒有存活(FP, False Positive)
FP = svm_confuseMatrix[1,2]

# 預測沒有存活而實際有存活(FN, False Negative)
FN = svm_confuseMatrix[2,1]

# 預測沒有存活而實際沒有存活(TN, True Negative)
TN = svm_confuseMatrix[1,1]

# Precision, TP/(TP+FP)
precision <- TP/(TP+FP)

# Recall, TP/(TP+FN)
recall <- TP/(TP+FN)

# 整體準確率(accuracy), accuracy = (TP+TN)/(TP+FP+FN+TN)
accuracy <- (TP+TN)/(TP+FP+FN+TN)

# 計算F1 Score, F1 Score = 2TP/(2TP+FN+FP)
F1_score <- 2*TP/(2*TP+FN+FP)

svm_summaryTable <- data.frame(name=c("TP","FP","FN","TN","Precision","Recall","Accuracy","F1 Score"),
           svm_score=c(TP,FP,FN,TN,precision,recall,accuracy,F1_score))
svm_summaryTable

##        name   svm_score
## 1        TP  54.0000000
## 2        FP  22.0000000
## 3        FN  45.0000000
## 4        TN 160.0000000
## 5 Precision   0.7105263
## 6    Recall   0.5454545
## 7  Accuracy   0.7615658
## 8  F1 Score   0.6171429

SVM的ROC

# 生成 ROC 線
svm_ROC = roc(response = svm_validDataPred$Survived,
              predictor = svm_validDataPred$yes)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(svm_ROC,col="#01800C",main=paste("ROC curve "),print.auc=TRUE, print.thres=TRUE)

GBM

# 合併測試期預測結果與訓練期資料
gbm_validDataPred <- cbind(valid_data, gbm_probs)
head(gbm_validDataPred)

##    Pclass    Sex  Age SibSp Parch    Fare Embarked Title Survived         0
## 7       1   male 54.0     0     0 51.8625        S    Mr        0 0.8932757
## 12      1 female 58.0     0     0 26.5500        S  Miss        1 0.3007826
## 15      3 female 14.0     0     0  7.8542        S  Miss        0 0.1294000
## 22      2   male 34.0     0     0 13.0000        S    Mr        1 0.8072050
## 24      1   male 28.0     0     0 35.5000        S    Mr        1 0.8289675
## 27      3   male 28.5     0     0  7.2250        C    Mr        0 0.9482286
##             1
## 7  0.10672434
## 12 0.69921742
## 15 0.87060005
## 22 0.19279501
## 24 0.17103245
## 27 0.05177144

# 將預測出來的分類機率欄位名稱從(0,1)改回(no,yes)
gbm_validDataPred <- gbm_validDataPred %>% 
  rename(no = `0`, yes = `1`)                                           
head(gbm_validDataPred)

##    Pclass    Sex  Age SibSp Parch    Fare Embarked Title Survived        no
## 7       1   male 54.0     0     0 51.8625        S    Mr        0 0.8932757
## 12      1 female 58.0     0     0 26.5500        S  Miss        1 0.3007826
## 15      3 female 14.0     0     0  7.8542        S  Miss        0 0.1294000
## 22      2   male 34.0     0     0 13.0000        S    Mr        1 0.8072050
## 24      1   male 28.0     0     0 35.5000        S    Mr        1 0.8289675
## 27      3   male 28.5     0     0  7.2250        C    Mr        0 0.9482286
##           yes
## 7  0.10672434
## 12 0.69921742
## 15 0.87060005
## 22 0.19279501
## 24 0.17103245
## 27 0.05177144

# 查看訓練集的預測情形
gbm_summaryvalid <- gbm_validDataPred %>% 
  filter(yes > 0.5) %>%                           # yes 為預測存活下來的機率
  summarise(count = n(), 
            accuracy_rate = mean(Survived))              #  統計預測存活下來的人數與實際比較的準確率

gbm_summaryvalid

##   count accuracy_rate
## 1   109     0.6788991

# count  accuracy_rate
# 109   0.6788991

# 查看測試集分類機率在不同門檻值下的預測情況
gbm_summaryvalidCut <- gbm_validDataPred %>% 
  # 看不同的分類機率做區間查看其準確率
  mutate(interval = cut(yes,breaks = c(0,0.5,0.55,0.6,0.65,0.8,1),include.lowest = T)) %>%  
  group_by(interval) %>%
  summarise(count = n(), 
            accuracy_rate = mean(Survived))

gbm_summaryvalidCut

## # A tibble: 6 x 3
##   interval   count accuracy_rate
##   <fct>      <int>         <dbl>
## 1 [0,0.5]      172         0.145
## 2 (0.5,0.55]     4         0    
## 3 (0.55,0.6]     6         0.667
## 4 (0.6,0.65]     6         0.333
## 5 (0.65,0.8]    14         0.357
## 6 (0.8,1]       79         0.797

建立混淆矩陣

# 建立混淆矩陣
# 將預測結果依照分類機率大於0.8當成1
gbm_predictResult <- as.numeric(gbm_validDataPred %>% as.data.frame() %>% select(yes) > 0.8) 
# 將預測結果與實際結果做成混淆矩陣
gbm_confuseMatrix <- table(gbm_validDataPred$Survived, gbm_predictResult, dnn = c("實際","預測")) 
gbm_confuseMatrix

##     預測
## 實際   0   1
##    0 166  16
##    1  36  63

#       預測
# 實際   0   1
#    0 166  16
#    1  36  63

# -------更加細緻的展現混淆矩陣-------
# 預測存活而實際有存活(TP, True Positive)
TP = gbm_confuseMatrix[2,2]

# 預期有存活而實際沒有存活(FP, False Positive)
FP = gbm_confuseMatrix[1,2]

# 預測沒有存活而實際有存活(FN, False Negative)
FN = gbm_confuseMatrix[2,1]

# 預測沒有存活而實際沒有存活(TN, True Negative)
TN = gbm_confuseMatrix[1,1]

# Precision, TP/(TP+FP)
precision <- TP/(TP+FP)

# Recall, TP/(TP+FN)
recall <- TP/(TP+FN)

# 整體準確率(accuracy), accuracy = (TP+TN)/(TP+FP+FN+TN)
accuracy <- (TP+TN)/(TP+FP+FN+TN)

# 計算F1 Score, F1 Score = 2TP/(2TP+FN+FP)
F1_score <- 2*TP/(2*TP+FN+FP)

gbm_summaryTable <- data.frame(name=c("TP","FP","FN","TN","Precision","Recall","Accuracy","F1 Score"),
           gbm_score=c(TP,FP,FN,TN,precision,recall,accuracy,F1_score))
gbm_summaryTable

##        name   gbm_score
## 1        TP  63.0000000
## 2        FP  16.0000000
## 3        FN  36.0000000
## 4        TN 166.0000000
## 5 Precision   0.7974684
## 6    Recall   0.6363636
## 7  Accuracy   0.8149466
## 8  F1 Score   0.7078652

GBM的ROC

# 生成 ROC 線
gbm_ROC = roc(response = gbm_validDataPred$Survived,
              predictor = gbm_validDataPred$yes)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(gbm_ROC,col = "#0043B0",print.auc=TRUE, main=paste("ROC curve "), print.thres=TRUE)

三個model的混淆矩陣比較

# 把各個model的score合併到一起
Table1 <- left_join(rf_summaryTable, svm_summaryTable, by="name")
allSummaryTable <- left_join(Table1, gbm_summaryTable,by="name")
allSummaryTable

##        name    rf_score   svm_score   gbm_score
## 1        TP  63.0000000  54.0000000  63.0000000
## 2        FP  14.0000000  22.0000000  16.0000000
## 3        FN  36.0000000  45.0000000  36.0000000
## 4        TN 168.0000000 160.0000000 166.0000000
## 5 Precision   0.8181818   0.7105263   0.7974684
## 6    Recall   0.6363636   0.5454545   0.6363636
## 7  Accuracy   0.8220641   0.7615658   0.8149466
## 8  F1 Score   0.7159091   0.6171429   0.7078652

三個model的ROC和AUC比較

plot(rf_ROC,
     print.auc=TRUE, print.auc.x=0.4, print.auc.y=0.5,
# 图像上输出AUC值,坐标为(x，y)
     auc.polygon=TRUE, auc.polygon.col="#fff7f7", # 设置ROC曲线下填充色
     max.auc.polygon=FALSE,  # 填充整个图像
     grid=c(0.5, 0.2), grid.col=c("black", "black"),  # 设置间距为0.1，0.2，线条颜色
     print.thres=TRUE, print.thres.cex=0.7,print.thres.adj = c(1, -1),# 图像上输出最佳截断值，字体缩放倍数，字的位置
     smooth=F, # 绘制不平滑曲线
     main="Comparison of three ROC curves", # 添加标题
     col="#800101",  # 曲线紅色
     legacy.axes=TRUE)   # 使横轴从0到1，表示为1-特异度

plot.roc(svm_ROC,
         add=T,  # 增加曲线
         col="#01800C", # 曲线颜色为綠色
         print.thres=TRUE, print.thres.cex=0.7,# 图像上输出最佳截断值，字体缩放倍数
         print.auc=TRUE, print.auc.x=0.4,print.auc.y=0.4,print.thres.adj = c(1.2, 1.2),
# 图像上输出AUC值,坐标为(x，y)
         smooth = F)  # 绘制不平滑曲线

plot.roc(gbm_ROC,
         add=T,  # 增加曲线
         col="#0043B0", # 曲线颜色为藍色
         print.thres=TRUE, print.thres.cex=0.7, print.thres.adj = c(-0.5, 1),# 图像上输出最佳截断值，字体缩放倍数
         print.auc=TRUE, print.auc.x=0.4,print.auc.y=0.3,
# 图像上输出AUC值,坐标为(x，y)
         smooth = F)  # 绘制不平滑曲线