library(mice)
##
## 載入套件:'mice'
## 下列物件被遮斷自 'package:stats':
##
## filter
## 下列物件被遮斷自 'package:base':
##
## cbind, rbind
library(car)
## 載入需要的套件:carData
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 匯入資料,將空格取代為NA
# 顯示 train 的頭兩行,與 test 的末兩行
train <- read.csv("data/train.csv", na.strings=c(""," ","NA"))
head(train,2)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 <NA> S
## 2 PC 17599 71.2833 C85 C
test <- read.csv("data/test.csv", na.strings=c(""," ","NA"))
tail(test,2)
## PassengerId Pclass Name Sex Age SibSp Parch Ticket
## 417 1308 3 Ware, Mr. Frederick male NA 0 0 359309
## 418 1309 3 Peter, Master. Michael J male NA 1 1 2668
## Fare Cabin Embarked
## 417 8.0500 <NA> S
## 418 22.3583 <NA> C
# 刪除多餘列,並合併資料
train2 <- train[-2]
data <- rbind(train2, test)
head(data,2)
## PassengerId Pclass Name Sex
## 1 1 3 Braund, Mr. Owen Harris male
## 2 2 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 22 1 0 A/5 21171 7.2500 <NA> S
## 2 38 1 0 PC 17599 71.2833 C85 C
# 修改Cabin資料,改為適用的categorical format
data$Cabin<-substr(data$Cabin, 1,1)
data$Cabin<- as.factor(data$Cabin)
head(data,2)
## PassengerId Pclass Name Sex
## 1 1 3 Braund, Mr. Owen Harris male
## 2 2 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 22 1 0 A/5 21171 7.2500 <NA> S
## 2 38 1 0 PC 17599 71.2833 C C
# 繪製年齡直方圖
hist(data$Age, 40, col="gray", main="Histogram of Age before imputation")
# 利用 Multiple Imputation 填補數據缺失
# 利用 Random forest 填補 numeric, binary 和 ordered 變數
mice_mod <- mice(data[, c("Age", "Fare", "Cabin", "Embarked")], method='rf')
##
## iter imp variable
## 1 1 Age Fare Cabin
## 1 2 Age Fare Cabin
## 1 3 Age Fare Cabin
## 1 4 Age Fare Cabin
## 1 5 Age Fare Cabin
## 2 1 Age Fare Cabin
## 2 2 Age Fare Cabin
## 2 3 Age Fare Cabin
## 2 4 Age Fare Cabin
## 2 5 Age Fare Cabin
## 3 1 Age Fare Cabin
## 3 2 Age Fare Cabin
## 3 3 Age Fare Cabin
## 3 4 Age Fare Cabin
## 3 5 Age Fare Cabin
## 4 1 Age Fare Cabin
## 4 2 Age Fare Cabin
## 4 3 Age Fare Cabin
## 4 4 Age Fare Cabin
## 4 5 Age Fare Cabin
## 5 1 Age Fare Cabin
## 5 2 Age Fare Cabin
## 5 3 Age Fare Cabin
## 5 4 Age Fare Cabin
## 5 5 Age Fare Cabin
## Warning: Number of logged events: 1
mice_complete <- complete(mice_mod)
# 更替填補後各列
data$Age <- mice_complete$Age
data$Fare <- mice_complete$Fare
data$Cabin <- mice_complete$Cabin
data$Embarked <- mice_complete$Embarked
head(data,2)
## PassengerId Pclass Name Sex
## 1 1 3 Braund, Mr. Owen Harris male
## 2 2 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 22 1 0 A/5 21171 7.2500 F S
## 2 38 1 0 PC 17599 71.2833 C C
# 抽出所需數據,確保艙位等級為類別資料
data2 <- data[,c(1,2,3,4,5,6,7,9,10,11)]
data2$Pclass<- as.factor(data2$Pclass)
head(data2,2)
## PassengerId Pclass Name Sex
## 1 1 3 Braund, Mr. Owen Harris male
## 2 2 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## Age SibSp Parch Fare Cabin Embarked
## 1 22 1 0 7.2500 F S
## 2 38 1 0 71.2833 C C
qqnorm(data2$Fare, pch = 1, frame = FALSE)
qqline(data2$Fare, col = "steelblue", lwd = 2)
qqPlot(data2$Fare)
## [1] 259 680
可以看出數據在中間部分較符合常態分佈,右側明顯偏離(長尾),左側稍有偏離,但不如右側明顯。因為這筆數據是 Titanic 的票價,所以有少數價格及高的票價會影響分布的結果。
# 將乘客身分別分類
#or grepl grepl("Mrs. | Mme. | Mlle. | Countess.", data2$Name), 1, "")
data2$Name_f <- ifelse(str_detect(data2$Name, "Mrs. | Mme. | Mlle. | Countess."), 1,
ifelse(str_detect(data2$Name, "Mr. | Don. | Jonkheer."), 2,
ifelse(str_detect(data2$Name, "Master."), 3,
ifelse(str_detect(data2$Name, "Miss. | Ms."), 4,
ifelse(str_detect(data2$Name, "Rev."), 5,
ifelse(str_detect(data2$Name, "Dr."), 6,
ifelse(str_detect(data2$Name, "Col. | Major. | Capt."), 7, "")))))))
data2$Name_f<-as.factor(data2$Name_f)
# 乘客身分分布
# 無、男士、女士、男孩、女孩、牧師、博士、軍官
plot(data2$Name_f)