Titanic 分析

library(mice)
## 
## 載入套件:'mice'
## 下列物件被遮斷自 'package:stats':
## 
##     filter
## 下列物件被遮斷自 'package:base':
## 
##     cbind, rbind
library(car)
## 載入需要的套件:carData
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 匯入資料,將空格取代為NA
# 顯示 train 的頭兩行,與 test 的末兩行
train <- read.csv("data/train.csv", na.strings=c(""," ","NA"))
head(train,2)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
##      Ticket    Fare Cabin Embarked
## 1 A/5 21171  7.2500  <NA>        S
## 2  PC 17599 71.2833   C85        C
test <- read.csv("data/test.csv", na.strings=c(""," ","NA"))
tail(test,2)
##     PassengerId Pclass                     Name  Sex Age SibSp Parch Ticket
## 417        1308      3      Ware, Mr. Frederick male  NA     0     0 359309
## 418        1309      3 Peter, Master. Michael J male  NA     1     1   2668
##        Fare Cabin Embarked
## 417  8.0500  <NA>        S
## 418 22.3583  <NA>        C
# 刪除多餘列,並合併資料
train2 <- train[-2]
data <- rbind(train2, test)
head(data,2)
##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
##   Age SibSp Parch    Ticket    Fare Cabin Embarked
## 1  22     1     0 A/5 21171  7.2500  <NA>        S
## 2  38     1     0  PC 17599 71.2833   C85        C
# 修改Cabin資料,改為適用的categorical format
data$Cabin<-substr(data$Cabin, 1,1)
data$Cabin<- as.factor(data$Cabin)
head(data,2)
##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
##   Age SibSp Parch    Ticket    Fare Cabin Embarked
## 1  22     1     0 A/5 21171  7.2500  <NA>        S
## 2  38     1     0  PC 17599 71.2833     C        C
# 繪製年齡直方圖
hist(data$Age, 40, col="gray", main="Histogram of Age before imputation")

# 利用 Multiple Imputation 填補數據缺失
# 利用 Random forest 填補 numeric, binary 和 ordered 變數

mice_mod <- mice(data[, c("Age", "Fare", "Cabin", "Embarked")], method='rf')
## 
##  iter imp variable
##   1   1  Age  Fare  Cabin
##   1   2  Age  Fare  Cabin
##   1   3  Age  Fare  Cabin
##   1   4  Age  Fare  Cabin
##   1   5  Age  Fare  Cabin
##   2   1  Age  Fare  Cabin
##   2   2  Age  Fare  Cabin
##   2   3  Age  Fare  Cabin
##   2   4  Age  Fare  Cabin
##   2   5  Age  Fare  Cabin
##   3   1  Age  Fare  Cabin
##   3   2  Age  Fare  Cabin
##   3   3  Age  Fare  Cabin
##   3   4  Age  Fare  Cabin
##   3   5  Age  Fare  Cabin
##   4   1  Age  Fare  Cabin
##   4   2  Age  Fare  Cabin
##   4   3  Age  Fare  Cabin
##   4   4  Age  Fare  Cabin
##   4   5  Age  Fare  Cabin
##   5   1  Age  Fare  Cabin
##   5   2  Age  Fare  Cabin
##   5   3  Age  Fare  Cabin
##   5   4  Age  Fare  Cabin
##   5   5  Age  Fare  Cabin
## Warning: Number of logged events: 1
mice_complete <- complete(mice_mod)
# 更替填補後各列
data$Age <- mice_complete$Age 
data$Fare <- mice_complete$Fare 
data$Cabin <- mice_complete$Cabin
data$Embarked <- mice_complete$Embarked
head(data,2)
##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
##   Age SibSp Parch    Ticket    Fare Cabin Embarked
## 1  22     1     0 A/5 21171  7.2500     F        S
## 2  38     1     0  PC 17599 71.2833     C        C
# 抽出所需數據,確保艙位等級為類別資料
data2 <- data[,c(1,2,3,4,5,6,7,9,10,11)]
data2$Pclass<- as.factor(data2$Pclass)
head(data2,2)
##   PassengerId Pclass                                                Name    Sex
## 1           1      3                             Braund, Mr. Owen Harris   male
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
##   Age SibSp Parch    Fare Cabin Embarked
## 1  22     1     0  7.2500     F        S
## 2  38     1     0 71.2833     C        C
qqnorm(data2$Fare, pch = 1, frame = FALSE)
qqline(data2$Fare, col = "steelblue", lwd = 2)

qqPlot(data2$Fare)

## [1] 259 680

可以看出數據在中間部分較符合常態分佈,右側明顯偏離(長尾),左側稍有偏離,但不如右側明顯。因為這筆數據是 Titanic 的票價,所以有少數價格及高的票價會影響分布的結果。

# 將乘客身分別分類
#or grepl grepl("Mrs. | Mme. | Mlle. | Countess.", data2$Name), 1, "") 
data2$Name_f <- ifelse(str_detect(data2$Name, "Mrs. | Mme. | Mlle. | Countess."), 1, 
                        ifelse(str_detect(data2$Name, "Mr. | Don. | Jonkheer."), 2, 
                            ifelse(str_detect(data2$Name, "Master."), 3, 
                                ifelse(str_detect(data2$Name, "Miss. | Ms."), 4,  
                                    ifelse(str_detect(data2$Name, "Rev."), 5, 
                                        ifelse(str_detect(data2$Name, "Dr."), 6, 
                                            ifelse(str_detect(data2$Name, "Col. | Major. | Capt."), 7, "")))))))
data2$Name_f<-as.factor(data2$Name_f)
# 乘客身分分布
# 無、男士、女士、男孩、女孩、牧師、博士、軍官
plot(data2$Name_f)