[참고] https://www.kaggle.com/pyy0715/titanic-data-analysis-with-r
변수 설명
library(readr)
library(stringr)
library(ggplot2)
library(scales)
library(doBy)
library(RColorBrewer)
library(corrplot)
library(dplyr)
library(gridExtra)
library(tidyverse)
library(xgboost)
library(caret)
library(ROCR)
library(Metrics)
train.csv : 예측 모델을 만들기 위해 사용하는 학습 데이터다. 탑승객의 신상정보와 생존유무가 주어진다.
test.csv : 학습 데이터에서 신상정보 및 파생변수를 토대로 모델을 만들고 test.csv파일을 이용하여 생존유무를 예측한다.
sampleSubmission.csv : 제출시 사용하는 csv 파일이다.
## # A tibble: 891 x 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1 0 3 Brau~ male 22 1 0 A/5 2~ 7.25
## 2 2 1 1 Cumi~ fema~ 38 1 0 PC 17~ 71.3
## 3 3 1 3 Heik~ fema~ 26 0 0 STON/~ 7.92
## 4 4 1 1 Futr~ fema~ 35 1 0 113803 53.1
## 5 5 0 3 Alle~ male 35 0 0 373450 8.05
## 6 6 0 3 Mora~ male NA 0 0 330877 8.46
## 7 7 0 1 McCa~ male 54 0 0 17463 51.9
## 8 8 0 3 Pals~ male 2 3 1 349909 21.1
## 9 9 1 3 John~ fema~ 27 0 2 347742 11.1
## 10 10 1 2 Nass~ fema~ 14 1 0 237736 30.1
## # ... with 881 more rows, and 2 more variables: Cabin <chr>,
## # Embarked <chr>
## # A tibble: 418 x 11
## PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 892 3 Kell~ male 34.5 0 0 330911 7.83 <NA>
## 2 893 3 Wilk~ fema~ 47 1 0 363272 7 <NA>
## 3 894 2 Myle~ male 62 0 0 240276 9.69 <NA>
## 4 895 3 Wirz~ male 27 0 0 315154 8.66 <NA>
## 5 896 3 Hirv~ fema~ 22 1 1 31012~ 12.3 <NA>
## 6 897 3 Sven~ male 14 0 0 7538 9.22 <NA>
## 7 898 3 Conn~ fema~ 30 0 0 330972 7.63 <NA>
## 8 899 2 Cald~ male 26 1 1 248738 29 <NA>
## 9 900 3 Abra~ fema~ 18 0 0 2657 7.23 <NA>
## 10 901 3 Davi~ male 21 2 0 A/4 4~ 24.2 <NA>
## # ... with 408 more rows, and 1 more variable: Embarked <chr>
## # A tibble: 1,309 x 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1 0 3 Brau~ male 22 1 0 A/5 2~ 7.25
## 2 2 1 1 Cumi~ fema~ 38 1 0 PC 17~ 71.3
## 3 3 1 3 Heik~ fema~ 26 0 0 STON/~ 7.92
## 4 4 1 1 Futr~ fema~ 35 1 0 113803 53.1
## 5 5 0 3 Alle~ male 35 0 0 373450 8.05
## 6 6 0 3 Mora~ male NA 0 0 330877 8.46
## 7 7 0 1 McCa~ male 54 0 0 17463 51.9
## 8 8 0 3 Pals~ male 2 3 1 349909 21.1
## 9 9 1 3 John~ fema~ 27 0 2 347742 11.1
## 10 10 1 2 Nass~ fema~ 14 1 0 237736 30.1
## # ... with 1,299 more rows, and 2 more variables: Cabin <chr>,
## # Embarked <chr>
변수 형태 정의, 기본 요약
full <- full %>% # ticket과 cabin은 파생변수 생성을 위해 문자열로 놔둠
mutate(Survived = factor(Survived),
Pclass = factor(Pclass, ordered = T),
Name = factor(Name),
Sex = factor(Sex),
Embarked = factor(Embarked))
str(full)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1309 obs. of 12 variables:
## $ PassengerId: num 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 156 287 531 430 23 826 775 922 613 855 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr NA "C85" NA "C123" ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## PassengerId Survived Pclass Name
## Min. : 1 0 :549 1:323 Connolly, Miss. Kate : 2
## 1st Qu.: 328 1 :342 2:277 Kelly, Mr. James : 2
## Median : 655 NA's:418 3:709 Abbing, Mr. Anthony : 1
## Mean : 655 Abbott, Master. Eugene Joseph : 1
## 3rd Qu.: 982 Abbott, Mr. Rossmore Edward : 1
## Max. :1309 Abbott, Mrs. Stanton (Rosa Hunt): 1
## (Other) :1301
## Sex Age SibSp Parch
## female:466 Min. : 0.17 Min. :0.0000 Min. :0.000
## male :843 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.000
## Median :28.00 Median :0.0000 Median :0.000
## Mean :29.88 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.00 Max. :8.0000 Max. :9.000
## NA's :263
## Ticket Fare Cabin Embarked
## Length:1309 Min. : 0.000 Length:1309 C :270
## Class :character 1st Qu.: 7.896 Class :character Q :123
## Mode :character Median : 14.454 Mode :character S :914
## Mean : 33.295 NA's: 2
## 3rd Qu.: 31.275
## Max. :512.329
## NA's :1
결측치 EDA_1
## PassengerId Survived Pclass Name Sex Age
## 891 2 3 891 2 89
## SibSp Parch Ticket Fare Cabin Embarked
## 7 7 681 248 148 4
## PassengerId Survived Pclass Name Sex Age
## 0 418 0 0 0 263
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 1 1014 2
missing_values <- full %>% # 결측치 비율을 데이터프레임으로
summarize_all(funs(sum(is.na(.))/n())) # summarise_all() affects every variable
missing_values
## # A tibble: 1 x 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.319 0 0 0 0.201 0 0 0 7.64e-4
## # ... with 2 more variables: Cabin <dbl>, Embarked <dbl>
# tidyr::gather()함수를 이용하여 stack화 시킴 (설명변수들이 key로 지정한 변수에 나열되고, 결측값들이 value로 지정한 변수의 값으로)
missing_values <- tidyr::gather(missing_values,
key = "feature", value = "missing_pct")
missing_values
## # A tibble: 12 x 2
## feature missing_pct
## <chr> <dbl>
## 1 PassengerId 0
## 2 Survived 0.319
## 3 Pclass 0
## 4 Name 0
## 5 Sex 0
## 6 Age 0.201
## 7 SibSp 0
## 8 Parch 0
## 9 Ticket 0
## 10 Fare 0.000764
## 11 Cabin 0.775
## 12 Embarked 0.00153
missing_values %>%
ggplot(aes(x = reorder(feature, missing_pct), y = missing_pct)) + # 정렬을 위한 reorder() 축지정
geom_bar(stat = "identity", fill = "red") + # bar plot 그리기 stat = 'identity' 데이터프레임 값을 그대로 이용하여 그리라는 옵션
ggtitle("Rate of missing values in each features") +
theme(plot.title = element_text(face = "bold", # 글씨체
hjust = 0.5, # Horizon(가로비율) = 0.5
size = 15, color = "darkblue")) +
labs(x = "Feature names", y = "Rate") + # x,y축 제목 지정
coord_flip() # Plot의 x, y축 변환
결측치 EDA_2
# 결측값이 있는 변수로만 시각화
missing_values <- missing_values[missing_values$missing_pct > 0, ]
missing_values %>%
ggplot(aes(x = reorder(feature, missing_pct), y = missing_pct)) + # 정렬을 위한 reorder() 축지정
geom_bar(stat = "identity", fill = "red") + # bar plot 그리기 stat = 'identity' 데이터프레임 값을 그대로 이용하여 그리라는 옵션
ggtitle("Rate of missing values in each features") +
theme(plot.title = element_text(face = "bold", # 글씨체
hjust = 0.5, # Horizon(가로비율) = 0.5
size = 15, color = "darkblue")) +
labs(x = "Feature names", y = "Rate") + # x,y축 제목 지정
coord_flip() # Plot의 x, y축 변환
sex(성별) EDA
##
## female male
## 466 843
## Warning: Factor `Survived` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `Survived` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 6 x 3
## # Groups: Survived [3]
## Survived Sex freq
## <fct> <fct> <int>
## 1 0 female 81
## 2 0 male 468
## 3 1 female 233
## 4 1 male 109
## 5 <NA> female 152
## 6 <NA> male 266
##
## 0 1
## female 0.2579618 0.7420382
## male 0.8110919 0.1889081
# 성별 막대그래프
sex.p1 <- full %>%
group_by(Sex) %>%
summarize(N = n()) %>%
ggplot(aes(Sex, N)) +
geom_col() +
geom_text(aes(label = N), size = 5, vjust = 1.2, color = "#FFFFFF") +
ggtitle("Bar plot of Sex") +
labs(x = "Sex", y = "Count")
# 성별에 따른 생존률 막대그래프
sex.p2 <- full%>%
filter(!is.na(Survived)) %>%
ggplot(aes(factor(Sex), fill = factor(Survived))) +
geom_bar(position = "fill") + #"dodge"
scale_y_continuous(labels = percent) +
scale_fill_brewer(palette = "Set1") + # palette에 어떤색 넣을지 지정
# 일정한 간격으로 x축과 y축 설정 : scale_x_continuous(breaks=seq())
# 분석가 마음대로 x축과 y축 설정 : scale_x_continuous(breaks=c())
ggtitle("Survival Rate by Sex") +
labs(x = "Sex", y = "Rate")
grid.arrange(sex.p1,sex.p2,ncol=2)
Pclass(티켓 클래스) EDA
##
## 1 2 3
## 323 277 709
##
## 0 1
## 1 0.3703704 0.6296296
## 2 0.5271739 0.4728261
## 3 0.7576375 0.2423625
# Pclass 막대그래프
pclass.p1 <- full %>%
group_by(Pclass) %>%
summarize(N = n()) %>%
ggplot(aes(Pclass, N)) +
geom_col() +
geom_text(aes(label = N), size = 5, vjust = 1.2, color = "#FFFFFF") +
ggtitle("Bar plot of Pclass") +
labs(x = "Pclass", y = "Count")
# Pclass에 따른 생존률 막대그래프
pclass.p2 <- full%>%
filter(!is.na(Survived)) %>%
ggplot(aes(factor(Pclass), fill = factor(Survived))) +
geom_bar(position = "fill") +
scale_fill_brewer(palette = "Set1") +
ggtitle("Survival Rate by Pclass") +
labs(x = "Pclass", y = "Rate")
grid.arrange(pclass.p1,pclass.p2,ncol=2)
fare(탑승요금) EDA
# fare(탑승요금) 히스토그램
Fare.p1 <- full %>%
ggplot(aes(Fare)) +
geom_histogram(col = "yellow",
fill = "blue",
alpha = .5) +
ggtitle("Histogram of passengers Fare") +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 15))
# 생존여부에 따른 fare box plot
Fare.p2 <- full %>%
filter(!is.na(Survived)) %>%
ggplot(aes(Survived, Fare)) + # x축에 생존 y축에 fare
# 관측치를 회색점으로 찍되, 중복되는 부분은 퍼지게 그려줍니다.
geom_jitter(col = "gray") +
# 상자그림 : 투명도 50%
geom_boxplot(alpha = .5) +
ggtitle("Boxplot of passengers Fare") +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 15))
grid.arrange(Fare.p1, Fare.p2, ncol=2)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
age(나이) EDA
# 나이 분포 히스토그램
age.p1 <- full %>%
ggplot(aes(Age)) + # x값에 따른 y값을 그리는 것이 아니므로 축 지정 안해줘도 됨
# 히스토그램 그리기, 설정
geom_histogram(breaks = seq(0, 80, by = 1), # 간격 설정
col = "red", # 막대 경계선 색깔
fill = "green", # 막대 내부 색깔
alpha = .5) + # 막대 투명도 = 50%
# Plot title
ggtitle("All Titanic passengers age hitogram") +
theme(plot.title = element_text(face = "bold", # 글씨체
hjust = 0.5, # Horizon(가로비율) = 0.5
size = 15, color = "darkblue"))
# 나이에 따른 생존 분포 파악
age.p2 <- full %>%
filter(!is.na(Survived)) %>%
ggplot(aes(Age, fill = Survived)) +
geom_density(alpha = .5) + # 막대그래프가 아니고 밀도그래프니까 plot으로 축 지정하고 geom_bar 대신에 geom_density
ggtitle("Titanic passengers age density plot") +
theme(plot.title = element_text(face = "bold", hjust = 0.5,
size = 15, color = "darkblue"))
grid.arrange(age.p1,age.p2,ncol=2)
## Warning: Removed 263 rows containing non-finite values (stat_bin).
## Warning: Removed 177 rows containing non-finite values (stat_density).
sibsp(함께 탑승한 형제자매, 배우자 수 총합) EDA
##
## 0 1 2 3 4 5 8
## 891 319 42 20 22 6 9
## # A tibble: 12 x 3
## # Groups: Survived [2]
## Survived SibSp freq
## <dbl> <dbl> <int>
## 1 0 0 398
## 2 0 1 97
## 3 0 2 15
## 4 0 3 12
## 5 0 4 15
## 6 0 5 5
## 7 0 8 7
## 8 1 0 210
## 9 1 1 112
## 10 1 2 13
## 11 1 3 4
## 12 1 4 3
##
## 0 1
## 0 0.6546053 0.3453947
## 1 0.4641148 0.5358852
## 2 0.5357143 0.4642857
## 3 0.7500000 0.2500000
## 4 0.8333333 0.1666667
## 5 1.0000000 0.0000000
## 8 1.0000000 0.0000000
parch (함께 탑승한 부모, 자녀 수 총합) EDA
##
## 0 1 2 3 4 5 6
## 678 118 80 5 4 5 1
## # A tibble: 12 x 3
## # Groups: Survived [2]
## Survived Parch freq
## <dbl> <dbl> <int>
## 1 0 0 445
## 2 0 1 53
## 3 0 2 40
## 4 0 3 2
## 5 0 4 4
## 6 0 5 4
## 7 0 6 1
## 8 1 0 233
## 9 1 1 65
## 10 1 2 40
## 11 1 3 3
## 12 1 5 1
##
## 0 1
## 0 0.6563422 0.3436578
## 1 0.4491525 0.5508475
## 2 0.5000000 0.5000000
## 3 0.4000000 0.6000000
## 4 1.0000000 0.0000000
## 5 0.8000000 0.2000000
## 6 1.0000000 0.0000000
Embarked(탑승 항구) EDA
##
## C Q S
## 168 77 644
## # A tibble: 7 x 3
## # Groups: Survived [2]
## Survived Embarked freq
## <dbl> <chr> <int>
## 1 0 C 75
## 2 0 Q 47
## 3 0 S 427
## 4 1 C 93
## 5 1 Q 30
## 6 1 S 217
## 7 1 <NA> 2
##
## 0 1
## C 0.4464286 0.5535714
## Q 0.6103896 0.3896104
## S 0.6630435 0.3369565
결측치 처리
## PassengerId Survived Pclass Name Sex Age
## 0 418 0 0 0 263
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 1 1014 2
## # A tibble: 2 x 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <fct> <ord> <fct> <fct> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 62 1 1 Icar~ fema~ 38 0 0 113572 80
## 2 830 1 1 Ston~ fema~ 62 0 0 113572 80
## # ... with 2 more variables: Cabin <chr>, Embarked <fct>
embark_fare <- full[!is.na(full$Embarked), ] # A tibble: 1,307 x 12 embark_fare
ggplot(embark_fare, aes(x = Embarked, y = Fare, fill = factor(Pclass))) +
geom_boxplot() +
geom_hline(aes(yintercept=80), # fare가 80에 line 생성
colour='red', linetype='dashed', lwd=2) +
scale_y_continuous()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## # A tibble: 2 x 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <fct> <ord> <fct> <fct> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 62 1 1 Icar~ fema~ 38 0 0 113572 80
## 2 830 1 1 Ston~ fema~ 62 0 0 113572 80
## # ... with 2 more variables: Cabin <chr>, Embarked <fct>
## # A tibble: 1 x 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <fct> <ord> <fct> <fct> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1044 <NA> 3 Stor~ male 60.5 0 0 3701 NA
## # ... with 2 more variables: Cabin <chr>, Embarked <fct>
full$Fare[1044] <- median(full[full$Pclass == '3' & full$Embarked == 'S', ]$Fare, na.rm = TRUE) #중앙값으로 결측치 처리
full[1044,]
## # A tibble: 1 x 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <fct> <ord> <fct> <fct> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1044 <NA> 3 Stor~ male 60.5 0 0 3701 8.05
## # ... with 2 more variables: Cabin <chr>, Embarked <fct>
Feature engineering
Title <- full$Name
Title <- gsub("^.*, (.*?)\\..*$", "\\1", Title) # 정규표현식
full$Title <- Title
unique(full$Title)
## [1] "Mr" "Mrs" "Miss" "Master"
## [5] "Don" "Rev" "Dr" "Mme"
## [9] "Ms" "Major" "Lady" "Sir"
## [13] "Mlle" "Col" "Capt" "the Countess"
## [17] "Jonkheer" "Dona"
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
## | Capt | Col | Don | Dona |
## |--------------|--------------|--------------|--------------|
## | 1 | 4 | 1 | 1 |
## | 0.001 | 0.003 | 0.001 | 0.001 |
## |--------------|--------------|--------------|--------------|
##
## | Dr | Jonkheer | Lady | Major |
## |--------------|--------------|--------------|--------------|
## | 8 | 1 | 1 | 2 |
## | 0.006 | 0.001 | 0.001 | 0.002 |
## |--------------|--------------|--------------|--------------|
##
## | Master | Miss | Mlle | Mme |
## |--------------|--------------|--------------|--------------|
## | 61 | 260 | 2 | 1 |
## | 0.047 | 0.199 | 0.002 | 0.001 |
## |--------------|--------------|--------------|--------------|
##
## | Mr | Mrs | Ms | Rev |
## |--------------|--------------|--------------|--------------|
## | 757 | 197 | 2 | 8 |
## | 0.578 | 0.150 | 0.002 | 0.006 |
## |--------------|--------------|--------------|--------------|
##
## | Sir | the Countess |
## |--------------|--------------|
## | 1 | 1 |
## | 0.001 | 0.001 |
## |--------------|--------------|
# 5개 범주로 단순화 시키는 작업
full <- full %>%
# "%in%" 대신 "=="을 사용하게되면 Recyling Rule 때문에 원하는대로 되지 않습니다.
mutate(Title = ifelse(Title %in% c("Mlle", "Ms", "Lady", "Dona"), "Miss", Title), # %in% 개념
Title = ifelse(Title == "Mme", "Mrs", Title),
Title = ifelse(Title %in% c("Capt", "Col", "Major", "Dr", "Rev", "Don",
"Sir", "the Countess", "Jonkheer"), "Officer", Title),
Title = factor(Title))
# 파생변수 생성 후 각 범주별 빈도수, 비율 확인
descr::CrossTable(full$Title) # 5개의 범주로 축소
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
## | Master | Miss | Mr | Mrs | Officer |
## |---------|---------|---------|---------|---------|
## | 61 | 266 | 757 | 198 | 27 |
## | 0.047 | 0.203 | 0.578 | 0.151 | 0.021 |
## |---------|---------|---------|---------|---------|
# Sex 성별을 더미화한다
full$Sex <- ifelse(full$Sex == "male" ,0 , 1)
full$Sex <- as.factor(full$Sex)
#5.3 Fsize Sibsp와 Parch를 이용하여 Fsize 파생변수를 생성한다
full$Fsize <- full$SibSp + full$Parch + 1
table(full$Fsize)
##
## 1 2 3 4 5 6 7 8 11
## 790 235 159 43 22 25 16 8 11
# Fsize에 따른 생존율 시각화
Fsize.p1 <- full%>%
filter(!is.na(Survived)) %>%
ggplot(aes(Fsize, fill = Survived)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = percent) +
scale_x_continuous(breaks=c(1:11)) +
scale_fill_brewer(palette = "Set1") + # palette에 어떤색 넣을지 지정
# 일정한 간격으로 x축과 y축 설정 : scale_x_continuous(breaks=seq())
# 분석가 마음대로 x축과 y축 설정 : scale_x_continuous(breaks=c())
ggtitle("Survival Rate by Fsize") +
labs(x = "Fsize", y = "Rate")
Fsize.p1
## Warning: Unknown or uninitialised column: 'Familysize'.
full$Familysize[full$Fsize < 5 & full$Fsize > 1] <- 'small'
full$Familysize[full$Fsize > 4] <- 'large'
full$Familysize <- as.factor(full$Familysize)
table(full$Familysize)
##
## large single small
## 82 790 437
# 범주화 후 Familiysize에 따른 생존율 시각화
ggplot(full[1:891,], aes(x = Familysize, fill = Survived)) +
geom_bar(position = 'fill') +
ggtitle("Survival Rate by Familysize")
## $x
## [1] "Familysize"
##
## $y
## [1] "Rate"
##
## attr(,"class")
## [1] "labels"
## [1] NA "C85" NA "C123" NA
## [6] NA "E46" NA NA NA
## [11] "G6" "C103" NA NA NA
## [16] NA NA NA NA NA
## [21] NA "D56" NA "A6" NA
## [26] NA NA "C23 C25 C27"
## [1] "C" "8" "5"
#list 형식이여서 [[1]] 붙여 벡터로 출력
full$Deck <- factor(sapply(full$Cabin, function(x) strsplit(x, NULL)[[1]][1]))
full$Deck <- as.character(full$Deck)
str(full)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1309 obs. of 16 variables:
## $ PassengerId: num 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 156 287 531 430 23 826 775 922 613 855 ...
## $ Sex : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr NA "C85" NA "C123" ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ Title : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 3 1 4 4 ...
## $ Fsize : num 2 2 1 2 1 1 1 5 3 2 ...
## $ Familysize : Factor w/ 3 levels "large","single",..: 3 3 2 3 2 2 2 1 3 3 ...
## $ Deck : chr NA "C" NA "C" ...
## # A tibble: 6 x 15
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <fct> <ord> <fct> <fct> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1 0 3 Brau~ 0 22 1 0 A/5 2~ 7.25
## 2 2 1 1 Cumi~ 1 38 1 0 PC 17~ 71.3
## 3 3 1 3 Heik~ 1 26 0 0 STON/~ 7.92
## 4 4 1 1 Futr~ 1 35 1 0 113803 53.1
## 5 5 0 3 Alle~ 0 35 0 0 373450 8.05
## 6 6 0 3 Mora~ 0 NA 0 0 330877 8.46
## # ... with 5 more variables: Embarked <fct>, Title <fct>, Fsize <dbl>,
## # Familysize <fct>, Deck <chr>
full$Deck[is.na(full$Deck)] <- "U"
cabin <- full %>% filter(!is.na(full$Survived) & full$Deck != 'U')
ggplot(cabin, aes(
x = Deck,
fill = factor(Survived),
na.rm = TRUE
)) +
geom_bar(stat = 'count') +
facet_grid(. ~ Pclass) +
labs(title = "Survivor split by Pclass and Deck")
full <- full %>%
mutate(Deck = ifelse(
Pclass == 1 & Deck == "U",
"X",
ifelse(
Pclass == 2 & Deck == "U",
"Y",
ifelse(Pclass == 3 &
Deck == "U", "Z", Deck)
)
))
full %>% count(Deck)
## # A tibble: 11 x 2
## Deck n
## <chr> <int>
## 1 A 22
## 2 B 65
## 3 C 94
## 4 D 46
## 5 E 41
## 6 F 21
## 7 G 5
## 8 T 1
## 9 X 67
## 10 Y 254
## 11 Z 693
#
age.sex <- full %>% #Sex에 따른 Age 탐색
ggplot(aes(Age, fill = Sex)) +
geom_density(alpha = .5) +
ggtitle("Titanic passengers Age density plot") +
theme(plot.title = element_text(face = "bold", hjust = 0.5,
size = 15, color = "darkblue"))
age.sex
## Warning: Removed 263 rows containing non-finite values (stat_density).
age.pclass <- full %>% #Pclass에 따른 Age 탐색
ggplot(aes(Age, fill = Pclass)) +
geom_density(alpha = .5) +
ggtitle("Titanic passengers Age density plot") +
theme(plot.title = element_text(face = "bold", hjust = 0.5,
size = 15, color = "darkblue"))
age.pclass
## Warning: Removed 263 rows containing non-finite values (stat_density).
age.title <- full %>% #Title에 따른 Age 탐색
ggplot(aes(Age, fill = Title)) +
geom_density(alpha = .5) +
ggtitle("Titanic passengers Age density plot") +
theme(plot.title = element_text(face = "bold", hjust = 0.5,
size = 15, color = "darkblue"))
age.title
## Warning: Removed 263 rows containing non-finite values (stat_density).
# title별 Median Age를 통한 결측값 처리
full <- as.data.frame(full)
summaryBy(Age ~ Title, data=full, FUN=c(mean, sd, median), na.rm=TRUE) ## ddply, library(doBy)
## Title Age.mean Age.sd Age.median
## 1 Master 5.482642 4.161554 4
## 2 Miss 22.026000 12.300349 22
## 3 Mr 32.252151 12.422089 29
## 4 Mrs 36.918129 12.902087 35
## 5 Officer 45.307692 11.460434 48
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Master'), 4, full$Age)
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Miss'), 22, full$Age)
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Mr'), 29, full$Age)
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Mrs'), 35, full$Age)
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Officer'), 48, full$Age)
#Age 변수 가공
hist(full$Age, freq=F, main='Age',col='lightgreen', ylim=c(0,0.05))
# child : 18세 이하
# adult : 19세 이상 64세 이하
# senior : 65세 이상
full$Age <- ifelse(full$Age <= 18, "child",
ifelse(full$Age > 18 & full$Age <= 64, "adult","senior"))
## [1] 929
## [1] "A/5 21171" "PC 17599" "STON/O2. 3101282"
## [4] "113803" "373450" "330877"
## PassengerId Survived Pclass
## 1 258 1 1
## 2 505 1 1
## 3 760 1 1
## 4 263 0 1
## 5 559 1 1
## 6 586 1 1
## Name Sex Age SibSp
## 1 Cherry, Miss. Gladys 1 adult 0
## 2 Maioni, Miss. Roberta 1 child 0
## 3 Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards) 1 adult 0
## 4 Taussig, Mr. Emil 0 adult 1
## 5 Taussig, Mrs. Emil (Tillie Mandelbaum) 1 adult 1
## 6 Taussig, Miss. Ruth 1 child 0
## Parch Ticket Fare Embarked Title Fsize Familysize Deck
## 1 0 110152 86.50 S Miss 1 single B
## 2 0 110152 86.50 S Miss 1 single B
## 3 0 110152 86.50 S Officer 1 single B
## 4 1 110413 79.65 S Mr 3 small E
## 5 1 110413 79.65 S Mrs 3 small E
## 6 2 110413 79.65 S Miss 3 small E
full$TravelGroup <- NA
full <- (transform(full, TravelGroup = match(Ticket, unique(Ticket))))
head(full)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris 0 adult 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) 1 adult 1
## 3 Heikkinen, Miss. Laina 1 adult 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 adult 1
## 5 Allen, Mr. William Henry 0 adult 0
## 6 Moran, Mr. James 0 adult 0
## Parch Ticket Fare Embarked Title Fsize Familysize Deck
## 1 0 A/5 21171 7.2500 S Mr 2 small Z
## 2 0 PC 17599 71.2833 C Mrs 2 small C
## 3 0 STON/O2. 3101282 7.9250 S Miss 1 single Z
## 4 0 113803 53.1000 S Mrs 2 small C
## 5 0 373450 8.0500 S Mr 1 single Z
## 6 0 330877 8.4583 Q Mr 1 single Z
## TravelGroup
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
full <- full %>%
group_by(TravelGroup) %>%
mutate(GroupSize = n()) %>%
ungroup()
full %>% arrange(Ticket) %>% head()
## # A tibble: 6 x 17
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
## <dbl> <fct> <ord> <fct> <fct> <chr> <dbl> <dbl> <chr> <dbl>
## 1 258 1 1 Cher~ 1 adult 0 0 110152 86.5
## 2 505 1 1 Maio~ 1 child 0 0 110152 86.5
## 3 760 1 1 Roth~ 1 adult 0 0 110152 86.5
## 4 263 0 1 Taus~ 0 adult 1 1 110413 79.6
## 5 559 1 1 Taus~ 1 adult 1 1 110413 79.6
## 6 586 1 1 Taus~ 1 child 0 2 110413 79.6
## # ... with 7 more variables: Embarked <fct>, Title <fct>, Fsize <dbl>,
## # Familysize <fct>, Deck <chr>, TravelGroup <int>, GroupSize <int>
## Classes 'tbl_df', 'tbl' and 'data.frame': 1309 obs. of 17 variables:
## $ PassengerId: num 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 156 287 531 430 23 826 775 922 613 855 ...
## $ Sex : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Age : chr "adult" "adult" "adult" "adult" ...
## $ SibSp : num 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ Title : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 3 1 4 4 ...
## $ Fsize : num 2 2 1 2 1 1 1 5 3 2 ...
## $ Familysize : Factor w/ 3 levels "large","single",..: 3 3 2 3 2 2 2 1 3 3 ...
## $ Deck : chr "Z" "C" "Z" "C" ...
## $ TravelGroup: int 1 2 3 4 5 6 7 8 9 10 ...
## $ GroupSize : int 1 2 1 2 1 1 2 5 3 2 ...
#범주화 안된 변수들 범주화 처리
factor_vars <- c('Age','GroupSize','Deck')
full[factor_vars] <- lapply(full[factor_vars], function(x) as.factor(x))
#Fare log변환
full$Fare=log1p(full$Fare)
full <- full %>% select(-c(1,4,7,8,9,13,16))
str(full)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1309 obs. of 10 variables:
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Sex : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Age : Factor w/ 3 levels "adult","child",..: 1 1 1 1 1 1 1 2 1 2 ...
## $ Fare : num 2.11 4.28 2.19 3.99 2.2 ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ Title : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 3 1 4 4 ...
## $ Familysize: Factor w/ 3 levels "large","single",..: 3 3 2 3 2 2 2 1 3 3 ...
## $ Deck : Factor w/ 11 levels "A","B","C","D",..: 11 3 11 3 11 11 5 11 11 10 ...
## $ GroupSize : Factor w/ 9 levels "1","2","3","4",..: 1 2 1 2 1 1 2 5 3 2 ...
train <- full %>% filter(is.na(Survived)==FALSE)
test <- full %>% filter(is.na(Survived)==TRUE)
train_label <- as.numeric(train$Survived)-1
test_label <- test$Survived
x_train <- model.matrix(~.-1, data = train[,-1]) %>% data.frame
x_test <- model.matrix(~.-1, data = test[,-1]) %>% data.frame
head(x_train)
## Pclass1 Pclass2 Pclass3 Sex1 Agechild Agesenior Fare EmbarkedQ
## 1 0 0 1 0 0 0 2.110213 0
## 2 1 0 0 1 0 0 4.280593 0
## 3 0 0 1 1 0 0 2.188856 0
## 4 1 0 0 1 0 0 3.990834 0
## 5 0 0 1 0 0 0 2.202765 0
## 6 0 0 1 0 0 0 2.246893 1
## EmbarkedS TitleMiss TitleMr TitleMrs TitleOfficer Familysizesingle
## 1 1 0 1 0 0 0
## 2 0 0 0 1 0 0
## 3 1 1 0 0 0 1
## 4 1 0 0 1 0 0
## 5 1 0 1 0 0 1
## 6 0 0 1 0 0 1
## Familysizesmall DeckB DeckC DeckD DeckE DeckF DeckG DeckT DeckX DeckY
## 1 1 0 0 0 0 0 0 0 0 0
## 2 1 0 1 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 1 0 1 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## DeckZ GroupSize2 GroupSize3 GroupSize4 GroupSize5 GroupSize6 GroupSize7
## 1 1 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0
## 3 1 0 0 0 0 0 0
## 4 0 1 0 0 0 0 0
## 5 1 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0
## GroupSize8 GroupSize11
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Pclass1 Pclass2 Pclass3 Sex1 Agechild Agesenior Fare EmbarkedQ
## 1 0 0 1 0 0 0 2.178064 1
## 2 0 0 1 1 0 0 2.079442 0
## 3 0 1 0 0 0 0 2.369075 1
## 4 0 0 1 0 0 0 2.268252 0
## 5 0 0 1 1 0 0 2.586824 0
## 6 0 0 1 0 1 0 2.324836 0
## EmbarkedS TitleMiss TitleMr TitleMrs TitleOfficer Familysizesingle
## 1 0 0 1 0 0 1
## 2 1 0 0 1 0 0
## 3 0 0 1 0 0 1
## 4 1 0 1 0 0 1
## 5 1 0 0 1 0 0
## 6 1 0 1 0 0 1
## Familysizesmall DeckB DeckC DeckD DeckE DeckF DeckG DeckT DeckX DeckY
## 1 0 0 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## DeckZ GroupSize2 GroupSize3 GroupSize4 GroupSize5 GroupSize6 GroupSize7
## 1 1 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0
## 5 1 1 0 0 0 0 0
## 6 1 0 0 0 0 0 0
## GroupSize8 GroupSize11
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# XGBOOST - cross vaildation
dtrain <- xgb.DMatrix(data = as.matrix(x_train), label=train_label)
dtest <- xgb.DMatrix(data = as.matrix(x_test))
set.seed(2019)
param <- list(objective = "binary:logistic",
eval_metric = "auc",
max_depth = 6,
eta = 0.01,
gammma = 0,
subsamle = 0.5,
colsample_bytree = 0.5,
min_child_weight = 5)
xgb <- xgb.train(params = param,
data = dtrain,
nrounds = 4790,
silent = 1,
print_every_n = 100,
verbose = 0)
summary(xgb)
## Length Class Mode
## handle 1 xgb.Booster.handle externalptr
## raw 3882785 -none- raw
## niter 1 -none- numeric
## call 7 -none- call
## params 10 -none- list
## callbacks 0 -none- list
## feature_names 33 -none- character
## nfeatures 1 -none- numeric
set.seed(123)
split <- createDataPartition(y = train$Survived,p = 0.7,list = FALSE)
new_train <- train[split,]
new_test <- train[-split,]
x_label <- as.numeric(new_train$Survived)-1
y_label <- as.numeric(new_test$Survived)-1
new_train2 <- model.matrix(~.-1, data = new_train[,-1]) %>% data.frame
new_test2 <- model.matrix(~.-1, data = new_test[,-1]) %>% data.frame
dtrain2 <- xgb.DMatrix(data = as.matrix(new_train2), label=x_label)
dtest2 <- xgb.DMatrix(data = as.matrix(new_test2), label=y_label)
xgb2 <- xgb.train(params = param,
data = dtrain2,
nrounds = 4790,
silent = 1,
print_every_n = 100,
verbose = 0)
summary(xgb2)
## Length Class Mode
## handle 1 xgb.Booster.handle externalptr
## raw 3449489 -none- raw
## niter 1 -none- numeric
## call 7 -none- call
## params 10 -none- list
## callbacks 0 -none- list
## feature_names 33 -none- character
## nfeatures 1 -none- numeric
## [1] 0.52341890 0.79136348 0.88854092 0.05676261 0.05077031 0.27443644
## [7] 0.81016070 0.05935763 0.11204775 0.20620725
## [1] 1 1 1 0 0 0 1 0 1 1
## Levels: 0 1
pr <- prediction(XGB_pred2,new_test$Survived)
perf <- performance(pr,measure = "tpr",x.measure = "fpr")
plot(perf)
## [1] 0.7938187