Analysis_practice

[참고] https://www.kaggle.com/pyy0715/titanic-data-analysis-with-r

변수 설명

survival : 생존유무, target 값. (0 = 사망, 1 = 생존)
pclass : 티켓 클래스. (1 = 1st, 2 = 2nd, 3 = 3rd)
sex : 성별
Age : 나이(세)
sibsp : 함께 탑승한 형제자매, 배우자 수 총합
parch : 함께 탑승한 부모, 자녀 수 총합
ticket : 티켓 넘버
fare : 탑승 요금
cabin : 객실 넘버
embarked : 탑승 항구

0.1 packages

library(readr)
library(stringr) 
library(ggplot2)
library(scales)
library(doBy)
library(RColorBrewer)
library(corrplot)
library(dplyr) 
library(gridExtra)
library(tidyverse)
library(xgboost)
library(caret)
library(ROCR) 
library(Metrics)

0.2 import data

train.csv : 예측 모델을 만들기 위해 사용하는 학습 데이터다. 탑승객의 신상정보와 생존유무가 주어진다.
test.csv : 학습 데이터에서 신상정보 및 파생변수를 토대로 모델을 만들고 test.csv파일을 이용하여 생존유무를 예측한다.
sampleSubmission.csv : 제출시 사용하는 csv 파일이다.

train <- read_csv('train.csv')
test <- read_csv('test.csv')
full <- bind_rows(train, test)
train

## # A tibble: 891 x 12
##    PassengerId Survived Pclass Name  Sex     Age SibSp Parch Ticket  Fare
##          <dbl>    <dbl>  <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>  <dbl>
##  1           1        0      3 Brau~ male     22     1     0 A/5 2~  7.25
##  2           2        1      1 Cumi~ fema~    38     1     0 PC 17~ 71.3 
##  3           3        1      3 Heik~ fema~    26     0     0 STON/~  7.92
##  4           4        1      1 Futr~ fema~    35     1     0 113803 53.1 
##  5           5        0      3 Alle~ male     35     0     0 373450  8.05
##  6           6        0      3 Mora~ male     NA     0     0 330877  8.46
##  7           7        0      1 McCa~ male     54     0     0 17463  51.9 
##  8           8        0      3 Pals~ male      2     3     1 349909 21.1 
##  9           9        1      3 John~ fema~    27     0     2 347742 11.1 
## 10          10        1      2 Nass~ fema~    14     1     0 237736 30.1 
## # ... with 881 more rows, and 2 more variables: Cabin <chr>,
## #   Embarked <chr>

test

## # A tibble: 418 x 11
##    PassengerId Pclass Name  Sex     Age SibSp Parch Ticket  Fare Cabin
##          <dbl>  <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
##  1         892      3 Kell~ male   34.5     0     0 330911  7.83 <NA> 
##  2         893      3 Wilk~ fema~  47       1     0 363272  7    <NA> 
##  3         894      2 Myle~ male   62       0     0 240276  9.69 <NA> 
##  4         895      3 Wirz~ male   27       0     0 315154  8.66 <NA> 
##  5         896      3 Hirv~ fema~  22       1     1 31012~ 12.3  <NA> 
##  6         897      3 Sven~ male   14       0     0 7538    9.22 <NA> 
##  7         898      3 Conn~ fema~  30       0     0 330972  7.63 <NA> 
##  8         899      2 Cald~ male   26       1     1 248738 29    <NA> 
##  9         900      3 Abra~ fema~  18       0     0 2657    7.23 <NA> 
## 10         901      3 Davi~ male   21       2     0 A/4 4~ 24.2  <NA> 
## # ... with 408 more rows, and 1 more variable: Embarked <chr>

full

## # A tibble: 1,309 x 12
##    PassengerId Survived Pclass Name  Sex     Age SibSp Parch Ticket  Fare
##          <dbl>    <dbl>  <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>  <dbl>
##  1           1        0      3 Brau~ male     22     1     0 A/5 2~  7.25
##  2           2        1      1 Cumi~ fema~    38     1     0 PC 17~ 71.3 
##  3           3        1      3 Heik~ fema~    26     0     0 STON/~  7.92
##  4           4        1      1 Futr~ fema~    35     1     0 113803 53.1 
##  5           5        0      3 Alle~ male     35     0     0 373450  8.05
##  6           6        0      3 Mora~ male     NA     0     0 330877  8.46
##  7           7        0      1 McCa~ male     54     0     0 17463  51.9 
##  8           8        0      3 Pals~ male      2     3     1 349909 21.1 
##  9           9        1      3 John~ fema~    27     0     2 347742 11.1 
## 10          10        1      2 Nass~ fema~    14     1     0 237736 30.1 
## # ... with 1,299 more rows, and 2 more variables: Cabin <chr>,
## #   Embarked <chr>

0.3 EDA

변수 형태 정의, 기본 요약

full <- full %>% # ticket과 cabin은 파생변수 생성을 위해 문자열로 놔둠
  mutate(Survived = factor(Survived),
         Pclass   = factor(Pclass, ordered = T),
         Name     = factor(Name),
         Sex      = factor(Sex),
         Embarked = factor(Embarked))

str(full)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1309 obs. of  12 variables:
##  $ PassengerId: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass     : Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 156 287 531 430 23 826 775 922 613 855 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : num  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  NA "C85" NA "C123" ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...

summary(full)

##   PassengerId   Survived   Pclass                                Name     
##  Min.   :   1   0   :549   1:323   Connolly, Miss. Kate            :   2  
##  1st Qu.: 328   1   :342   2:277   Kelly, Mr. James                :   2  
##  Median : 655   NA's:418   3:709   Abbing, Mr. Anthony             :   1  
##  Mean   : 655                      Abbott, Master. Eugene Joseph   :   1  
##  3rd Qu.: 982                      Abbott, Mr. Rossmore Edward     :   1  
##  Max.   :1309                      Abbott, Mrs. Stanton (Rosa Hunt):   1  
##                                    (Other)                         :1301  
##      Sex           Age            SibSp            Parch      
##  female:466   Min.   : 0.17   Min.   :0.0000   Min.   :0.000  
##  male  :843   1st Qu.:21.00   1st Qu.:0.0000   1st Qu.:0.000  
##               Median :28.00   Median :0.0000   Median :0.000  
##               Mean   :29.88   Mean   :0.4989   Mean   :0.385  
##               3rd Qu.:39.00   3rd Qu.:1.0000   3rd Qu.:0.000  
##               Max.   :80.00   Max.   :8.0000   Max.   :9.000  
##               NA's   :263                                     
##     Ticket               Fare            Cabin           Embarked  
##  Length:1309        Min.   :  0.000   Length:1309        C   :270  
##  Class :character   1st Qu.:  7.896   Class :character   Q   :123  
##  Mode  :character   Median : 14.454   Mode  :character   S   :914  
##                     Mean   : 33.295                      NA's:  2  
##                     3rd Qu.: 31.275                                
##                     Max.   :512.329                                
##                     NA's   :1

결측치 EDA_1

sapply(train, function(x) length(unique(x))) # unique level 개수

## PassengerId    Survived      Pclass        Name         Sex         Age 
##         891           2           3         891           2          89 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           7           7         681         248         148           4

colSums(is.na(full))     #변수별 na 개수,  colSums(!is.na(full))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           1        1014           2

missing_values <- full %>%      # 결측치 비율을 데이터프레임으로
  summarize_all(funs(sum(is.na(.))/n()))   # summarise_all() affects every variable
missing_values

## # A tibble: 1 x 12
##   PassengerId Survived Pclass  Name   Sex   Age SibSp Parch Ticket    Fare
##         <dbl>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl>
## 1           0    0.319      0     0     0 0.201     0     0      0 7.64e-4
## # ... with 2 more variables: Cabin <dbl>, Embarked <dbl>

# tidyr::gather()함수를 이용하여 stack화 시킴 (설명변수들이 key로 지정한 변수에 나열되고, 결측값들이 value로 지정한 변수의 값으로)
missing_values <- tidyr::gather(missing_values,
                                key = "feature", value = "missing_pct")
missing_values

## # A tibble: 12 x 2
##    feature     missing_pct
##    <chr>             <dbl>
##  1 PassengerId    0       
##  2 Survived       0.319   
##  3 Pclass         0       
##  4 Name           0       
##  5 Sex            0       
##  6 Age            0.201   
##  7 SibSp          0       
##  8 Parch          0       
##  9 Ticket         0       
## 10 Fare           0.000764
## 11 Cabin          0.775   
## 12 Embarked       0.00153

missing_values %>% 
  ggplot(aes(x = reorder(feature, missing_pct), y = missing_pct)) + # 정렬을 위한 reorder() 축지정
  geom_bar(stat = "identity", fill = "red") +  # bar plot 그리기 stat = 'identity' 데이터프레임 값을 그대로 이용하여 그리라는 옵션
  ggtitle("Rate of missing values in each features") +
  theme(plot.title = element_text(face = "bold",    # 글씨체 
                                  hjust = 0.5,      # Horizon(가로비율) = 0.5
                                  size = 15, color = "darkblue")) +
  labs(x = "Feature names", y = "Rate") +  # x,y축 제목 지정
  coord_flip() # Plot의 x, y축 변환

결측치 EDA_2

# 결측값이 있는 변수로만 시각화
missing_values <- missing_values[missing_values$missing_pct > 0, ]

missing_values %>% 
  ggplot(aes(x = reorder(feature, missing_pct), y = missing_pct)) + # 정렬을 위한 reorder() 축지정
  geom_bar(stat = "identity", fill = "red") +  # bar plot 그리기 stat = 'identity' 데이터프레임 값을 그대로 이용하여 그리라는 옵션
  ggtitle("Rate of missing values in each features") +
  theme(plot.title = element_text(face = "bold",    # 글씨체 
                                  hjust = 0.5,      # Horizon(가로비율) = 0.5
                                  size = 15, color = "darkblue")) +
  labs(x = "Feature names", y = "Rate") +  # x,y축 제목 지정
  coord_flip() # Plot의 x, y축 변환

sex(성별) EDA

table(full$Sex)

## 
## female   male 
##    466    843

full %>% group_by(Survived, Sex) %>% summarise(freq = n())

## Warning: Factor `Survived` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `Survived` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## # A tibble: 6 x 3
## # Groups:   Survived [3]
##   Survived Sex     freq
##   <fct>    <fct>  <int>
## 1 0        female    81
## 2 0        male     468
## 3 1        female   233
## 4 1        male     109
## 5 <NA>     female   152
## 6 <NA>     male     266

prop.table(table(full$Sex, full$Survived),1) #여자들이 생존할 확률이 높음 # 1=행, 2=열

##         
##                  0         1
##   female 0.2579618 0.7420382
##   male   0.8110919 0.1889081

# 성별 막대그래프
sex.p1 <- full %>% 
  group_by(Sex) %>% 
  summarize(N = n()) %>% 
  ggplot(aes(Sex, N)) +
  geom_col() +
  geom_text(aes(label = N), size = 5, vjust = 1.2, color = "#FFFFFF") + 
  ggtitle("Bar plot of Sex") +
  labs(x = "Sex", y = "Count")

# 성별에 따른 생존률 막대그래프
sex.p2 <- full%>%
  filter(!is.na(Survived)) %>%
  ggplot(aes(factor(Sex), fill = factor(Survived))) +
  geom_bar(position = "fill") + #"dodge"
  scale_y_continuous(labels = percent) +
  scale_fill_brewer(palette = "Set1") +  # palette에 어떤색 넣을지 지정
  # 일정한 간격으로 x축과 y축 설정 : scale_x_continuous(breaks=seq())
  # 분석가 마음대로 x축과 y축 설정 : scale_x_continuous(breaks=c())
  ggtitle("Survival Rate by Sex") + 
  labs(x = "Sex", y = "Rate")

grid.arrange(sex.p1,sex.p2,ncol=2)

Pclass(티켓 클래스) EDA

table(full$Pclass)

## 
##   1   2   3 
## 323 277 709

prop.table(table(full$Pclass,full$Survived),1) # 더 좋은 객실(1등실) 생존할 확률이 높음

##    
##             0         1
##   1 0.3703704 0.6296296
##   2 0.5271739 0.4728261
##   3 0.7576375 0.2423625

# Pclass 막대그래프
pclass.p1 <- full %>% 
  group_by(Pclass) %>% 
  summarize(N = n()) %>% 
  ggplot(aes(Pclass, N)) +
  geom_col() +
  geom_text(aes(label = N), size = 5, vjust = 1.2, color = "#FFFFFF") + 
  ggtitle("Bar plot of Pclass") +
  labs(x = "Pclass", y = "Count")

# Pclass에 따른 생존률 막대그래프
pclass.p2 <- full%>%
  filter(!is.na(Survived)) %>%
  ggplot(aes(factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = "fill") + 
  scale_fill_brewer(palette = "Set1") +  
  ggtitle("Survival Rate by Pclass") + 
  labs(x = "Pclass", y = "Rate")

grid.arrange(pclass.p1,pclass.p2,ncol=2)

fare(탑승요금) EDA

hist(full$Fare)

# fare(탑승요금) 히스토그램
Fare.p1 <- full %>%
  ggplot(aes(Fare)) + 
  geom_histogram(col    = "yellow",
                 fill   = "blue", 
                 alpha  = .5) +
  ggtitle("Histogram of passengers Fare") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 15))

# 생존여부에 따른 fare box plot
Fare.p2 <- full %>%
  filter(!is.na(Survived)) %>% 
  ggplot(aes(Survived, Fare)) +  # x축에 생존 y축에 fare
  # 관측치를 회색점으로 찍되, 중복되는 부분은 퍼지게 그려줍니다.
  geom_jitter(col = "gray") + 
  # 상자그림 : 투명도 50% 
  geom_boxplot(alpha = .5) + 
  ggtitle("Boxplot of passengers Fare") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 15))

grid.arrange(Fare.p1, Fare.p2, ncol=2)

## Warning: Removed 1 rows containing non-finite values (stat_bin).

age(나이) EDA

hist(full$Age)

# 나이 분포 히스토그램
age.p1 <- full %>% 
  ggplot(aes(Age)) +     # x값에 따른 y값을 그리는 것이 아니므로 축 지정 안해줘도 됨 
  # 히스토그램 그리기, 설정
  geom_histogram(breaks = seq(0, 80, by = 1), # 간격 설정 
                 col    = "red",              # 막대 경계선 색깔 
                 fill   = "green",            # 막대 내부 색깔 
                 alpha  = .5) +               # 막대 투명도 = 50% 
  # Plot title
  ggtitle("All Titanic passengers age hitogram") +
  theme(plot.title = element_text(face = "bold",    # 글씨체 
                                  hjust = 0.5,      # Horizon(가로비율) = 0.5
                                  size = 15, color = "darkblue"))

# 나이에 따른 생존 분포 파악
age.p2 <- full %>% 
  filter(!is.na(Survived)) %>%
  ggplot(aes(Age, fill = Survived)) + 
  geom_density(alpha = .5) +   # 막대그래프가 아니고 밀도그래프니까 plot으로 축 지정하고 geom_bar 대신에 geom_density
  ggtitle("Titanic passengers age density plot") + 
  theme(plot.title = element_text(face = "bold", hjust = 0.5,
                                  size = 15, color = "darkblue"))

grid.arrange(age.p1,age.p2,ncol=2)

## Warning: Removed 263 rows containing non-finite values (stat_bin).

## Warning: Removed 177 rows containing non-finite values (stat_density).

sibsp(함께 탑승한 형제자매, 배우자 수 총합) EDA

table(full$SibSp)

## 
##   0   1   2   3   4   5   8 
## 891 319  42  20  22   6   9

train %>% 
  group_by(Survived, SibSp) %>% 
  summarise(freq = n())

## # A tibble: 12 x 3
## # Groups:   Survived [2]
##    Survived SibSp  freq
##       <dbl> <dbl> <int>
##  1        0     0   398
##  2        0     1    97
##  3        0     2    15
##  4        0     3    12
##  5        0     4    15
##  6        0     5     5
##  7        0     8     7
##  8        1     0   210
##  9        1     1   112
## 10        1     2    13
## 11        1     3     4
## 12        1     4     3

prop.table(table(train$SibSp,train$Survived), 1) #배우자,형제자매가 많을수록 생존률이 떨어짐

##    
##             0         1
##   0 0.6546053 0.3453947
##   1 0.4641148 0.5358852
##   2 0.5357143 0.4642857
##   3 0.7500000 0.2500000
##   4 0.8333333 0.1666667
##   5 1.0000000 0.0000000
##   8 1.0000000 0.0000000

parch (함께 탑승한 부모, 자녀 수 총합) EDA

table(train$Parch)

## 
##   0   1   2   3   4   5   6 
## 678 118  80   5   4   5   1

train %>% 
  group_by(Survived, Parch) %>% 
  summarise(freq = n())

## # A tibble: 12 x 3
## # Groups:   Survived [2]
##    Survived Parch  freq
##       <dbl> <dbl> <int>
##  1        0     0   445
##  2        0     1    53
##  3        0     2    40
##  4        0     3     2
##  5        0     4     4
##  6        0     5     4
##  7        0     6     1
##  8        1     0   233
##  9        1     1    65
## 10        1     2    40
## 11        1     3     3
## 12        1     5     1

prop.table(table(train$Parch,train$Survived),1) #부모와 자녀를 1~3명 정도 동승했을 경우 생존률이 높음

##    
##             0         1
##   0 0.6563422 0.3436578
##   1 0.4491525 0.5508475
##   2 0.5000000 0.5000000
##   3 0.4000000 0.6000000
##   4 1.0000000 0.0000000
##   5 0.8000000 0.2000000
##   6 1.0000000 0.0000000

Embarked(탑승 항구) EDA

table(train$Embarked) #결측값 2개

## 
##   C   Q   S 
## 168  77 644

train %>% 
  group_by(Survived, Embarked) %>% 
  summarise(freq = n())

## # A tibble: 7 x 3
## # Groups:   Survived [2]
##   Survived Embarked  freq
##      <dbl> <chr>    <int>
## 1        0 C           75
## 2        0 Q           47
## 3        0 S          427
## 4        1 C           93
## 5        1 Q           30
## 6        1 S          217
## 7        1 <NA>         2

prop.table(table(train$Embarked,train$Survived),1) # C에서 탑승한 인원들만 생존률이 더 높다

##    
##             0         1
##   C 0.4464286 0.5535714
##   Q 0.6103896 0.3896104
##   S 0.6630435 0.3369565

0.4 data mumming_1

결측치 처리

# 결측치 개수 
colSums(is.na(full))  # A tibble: 1,309 x 12 full

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           1        1014           2

# Embarked 결측처리
full[is.na(full$Embarked), ] #두개의 관측치 모두 Fare가 80이고, Pclass가 1임

## # A tibble: 2 x 12
##   PassengerId Survived Pclass Name  Sex     Age SibSp Parch Ticket  Fare
##         <dbl> <fct>    <ord>  <fct> <fct> <dbl> <dbl> <dbl> <chr>  <dbl>
## 1          62 1        1      Icar~ fema~    38     0     0 113572    80
## 2         830 1        1      Ston~ fema~    62     0     0 113572    80
## # ... with 2 more variables: Cabin <chr>, Embarked <fct>

embark_fare <- full[!is.na(full$Embarked), ]  # A tibble: 1,307 x 12 embark_fare

ggplot(embark_fare, aes(x = Embarked, y = Fare, fill = factor(Pclass))) +
  geom_boxplot() +
  geom_hline(aes(yintercept=80), # fare가 80에 line 생성
             colour='red', linetype='dashed', lwd=2) +
  scale_y_continuous()

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

full$Embarked[c(62, 830)] <- 'C'
full[c(62, 830),]

## # A tibble: 2 x 12
##   PassengerId Survived Pclass Name  Sex     Age SibSp Parch Ticket  Fare
##         <dbl> <fct>    <ord>  <fct> <fct> <dbl> <dbl> <dbl> <chr>  <dbl>
## 1          62 1        1      Icar~ fema~    38     0     0 113572    80
## 2         830 1        1      Ston~ fema~    62     0     0 113572    80
## # ... with 2 more variables: Cabin <chr>, Embarked <fct>

# Fare 결측처리
full  %>% 
  filter(is.na(full$Fare)) #Pclasss가 3이고, Embarked는 S임

## # A tibble: 1 x 12
##   PassengerId Survived Pclass Name  Sex     Age SibSp Parch Ticket  Fare
##         <dbl> <fct>    <ord>  <fct> <fct> <dbl> <dbl> <dbl> <chr>  <dbl>
## 1        1044 <NA>     3      Stor~ male   60.5     0     0 3701      NA
## # ... with 2 more variables: Cabin <chr>, Embarked <fct>

full$Fare[1044] <- median(full[full$Pclass == '3' & full$Embarked == 'S', ]$Fare, na.rm = TRUE) #중앙값으로 결측치 처리
full[1044,]

## # A tibble: 1 x 12
##   PassengerId Survived Pclass Name  Sex     Age SibSp Parch Ticket  Fare
##         <dbl> <fct>    <ord>  <fct> <fct> <dbl> <dbl> <dbl> <chr>  <dbl>
## 1        1044 <NA>     3      Stor~ male   60.5     0     0 3701    8.05
## # ... with 2 more variables: Cabin <chr>, Embarked <fct>

0.5 data mumming_2

Feature engineering

Title <- full$Name
Title <- gsub("^.*, (.*?)\\..*$", "\\1", Title) # 정규표현식
full$Title <- Title
unique(full$Title)

##  [1] "Mr"           "Mrs"          "Miss"         "Master"      
##  [5] "Don"          "Rev"          "Dr"           "Mme"         
##  [9] "Ms"           "Major"        "Lady"         "Sir"         
## [13] "Mlle"         "Col"          "Capt"         "the Countess"
## [17] "Jonkheer"     "Dona"

descr::CrossTable(full$Title)

##    Cell Contents 
## |-------------------------|
## |                       N | 
## |           N / Row Total | 
## |-------------------------|
## 
## |         Capt |          Col |          Don |         Dona |
## |--------------|--------------|--------------|--------------|
## |            1 |            4 |            1 |            1 |
## |        0.001 |        0.003 |        0.001 |        0.001 |
## |--------------|--------------|--------------|--------------|
## 
## |           Dr |     Jonkheer |         Lady |        Major |
## |--------------|--------------|--------------|--------------|
## |            8 |            1 |            1 |            2 |
## |        0.006 |        0.001 |        0.001 |        0.002 |
## |--------------|--------------|--------------|--------------|
## 
## |       Master |         Miss |         Mlle |          Mme |
## |--------------|--------------|--------------|--------------|
## |           61 |          260 |            2 |            1 |
## |        0.047 |        0.199 |        0.002 |        0.001 |
## |--------------|--------------|--------------|--------------|
## 
## |           Mr |          Mrs |           Ms |          Rev |
## |--------------|--------------|--------------|--------------|
## |          757 |          197 |            2 |            8 |
## |        0.578 |        0.150 |        0.002 |        0.006 |
## |--------------|--------------|--------------|--------------|
## 
## |          Sir | the Countess |
## |--------------|--------------|
## |            1 |            1 |
## |        0.001 |        0.001 |
## |--------------|--------------|

# 5개 범주로 단순화 시키는 작업 
full <- full %>%
  # "%in%" 대신 "=="을 사용하게되면 Recyling Rule 때문에 원하는대로 되지 않습니다.
  mutate(Title = ifelse(Title %in% c("Mlle", "Ms", "Lady", "Dona"), "Miss", Title), # %in% 개념
         Title = ifelse(Title == "Mme", "Mrs", Title),
         Title = ifelse(Title %in% c("Capt", "Col", "Major", "Dr", "Rev", "Don",
                                     "Sir", "the Countess", "Jonkheer"), "Officer", Title),
         Title = factor(Title))
# 파생변수 생성 후 각 범주별 빈도수, 비율 확인 
descr::CrossTable(full$Title) # 5개의 범주로 축소

##    Cell Contents 
## |-------------------------|
## |                       N | 
## |           N / Row Total | 
## |-------------------------|
## 
## |  Master |    Miss |      Mr |     Mrs | Officer |
## |---------|---------|---------|---------|---------|
## |      61 |     266 |     757 |     198 |      27 |
## |   0.047 |   0.203 |   0.578 |   0.151 |   0.021 |
## |---------|---------|---------|---------|---------|

# Sex  성별을 더미화한다
full$Sex <- ifelse(full$Sex == "male" ,0 , 1)
full$Sex <- as.factor(full$Sex)

#5.3 Fsize  Sibsp와 Parch를 이용하여 Fsize 파생변수를 생성한다
full$Fsize <- full$SibSp + full$Parch + 1
table(full$Fsize)

## 
##   1   2   3   4   5   6   7   8  11 
## 790 235 159  43  22  25  16   8  11

# Fsize에 따른 생존율 시각화
Fsize.p1 <- full%>%
  filter(!is.na(Survived)) %>%
  ggplot(aes(Fsize, fill = Survived)) +
  geom_bar(position = "fill") + 
  scale_y_continuous(labels = percent) +
  scale_x_continuous(breaks=c(1:11)) +
  scale_fill_brewer(palette = "Set1") +  # palette에 어떤색 넣을지 지정
  # 일정한 간격으로 x축과 y축 설정 : scale_x_continuous(breaks=seq())
  # 분석가 마음대로 x축과 y축 설정 : scale_x_continuous(breaks=c())
  ggtitle("Survival Rate by Fsize") + 
  labs(x = "Fsize", y = "Rate")

Fsize.p1

# 범주화
full$Familysize[full$Fsize == 1] <- 'single'

## Warning: Unknown or uninitialised column: 'Familysize'.

full$Familysize[full$Fsize < 5 & full$Fsize > 1] <- 'small'
full$Familysize[full$Fsize > 4] <- 'large'

full$Familysize <- as.factor(full$Familysize)
table(full$Familysize)

## 
##  large single  small 
##     82    790    437

# 범주화 후 Familiysize에 따른 생존율 시각화
ggplot(full[1:891,], aes(x = Familysize, fill = Survived)) +
  geom_bar(position = 'fill') +
  ggtitle("Survival Rate by Familysize")

labs(x="Familysize", y="Rate")    # [1:891,] 뺀 경우

## $x
## [1] "Familysize"
## 
## $y
## [1] "Rate"
## 
## attr(,"class")
## [1] "labels"

# Cabin
full$Cabin[1:28]

##  [1] NA            "C85"         NA            "C123"        NA           
##  [6] NA            "E46"         NA            NA            NA           
## [11] "G6"          "C103"        NA            NA            NA           
## [16] NA            NA            NA            NA            NA           
## [21] NA            "D56"         NA            "A6"          NA           
## [26] NA            NA            "C23 C25 C27"

strsplit(full$Cabin[2], NULL)[[1]]   #NULL 그냥 구분, #8인경우 8을 기준으로 나눠라

## [1] "C" "8" "5"

                                     #list 형식이여서 [[1]] 붙여 벡터로 출력  

full$Deck <- factor(sapply(full$Cabin, function(x) strsplit(x, NULL)[[1]][1]))
full$Deck <- as.character(full$Deck)
str(full)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1309 obs. of  16 variables:
##  $ PassengerId: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass     : Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 156 287 531 430 23 826 775 922 613 855 ...
##  $ Sex        : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : num  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  NA "C85" NA "C123" ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ Title      : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 3 1 4 4 ...
##  $ Fsize      : num  2 2 1 2 1 1 1 5 3 2 ...
##  $ Familysize : Factor w/ 3 levels "large","single",..: 3 3 2 3 2 2 2 1 3 3 ...
##  $ Deck       : chr  NA "C" NA "C" ...

#Cabin 변수 제거
full <- full[, -11]
head(full)

## # A tibble: 6 x 15
##   PassengerId Survived Pclass Name  Sex     Age SibSp Parch Ticket  Fare
##         <dbl> <fct>    <ord>  <fct> <fct> <dbl> <dbl> <dbl> <chr>  <dbl>
## 1           1 0        3      Brau~ 0        22     1     0 A/5 2~  7.25
## 2           2 1        1      Cumi~ 1        38     1     0 PC 17~ 71.3 
## 3           3 1        3      Heik~ 1        26     0     0 STON/~  7.92
## 4           4 1        1      Futr~ 1        35     1     0 113803 53.1 
## 5           5 0        3      Alle~ 0        35     0     0 373450  8.05
## 6           6 0        3      Mora~ 0        NA     0     0 330877  8.46
## # ... with 5 more variables: Embarked <fct>, Title <fct>, Fsize <dbl>,
## #   Familysize <fct>, Deck <chr>

full$Deck[is.na(full$Deck)] <- "U"

cabin <- full %>% filter(!is.na(full$Survived) & full$Deck != 'U')

ggplot(cabin, aes(
  x = Deck,
  fill = factor(Survived),
  na.rm = TRUE
)) +
  geom_bar(stat = 'count') +
  facet_grid(. ~ Pclass) +
  labs(title = "Survivor split by Pclass and Deck")

full <- full  %>%
  mutate(Deck = ifelse(
    Pclass == 1 & Deck == "U",
    "X",
    ifelse(
      Pclass == 2 & Deck == "U",
      "Y",
      ifelse(Pclass == 3 &
               Deck == "U", "Z", Deck)
    )
  ))

full  %>% count(Deck)

## # A tibble: 11 x 2
##    Deck      n
##    <chr> <int>
##  1 A        22
##  2 B        65
##  3 C        94
##  4 D        46
##  5 E        41
##  6 F        21
##  7 G         5
##  8 T         1
##  9 X        67
## 10 Y       254
## 11 Z       693

#  
age.sex <- full %>%  #Sex에 따른 Age 탐색
  ggplot(aes(Age, fill = Sex)) + 
  geom_density(alpha = .5) +  
  ggtitle("Titanic passengers Age density plot") + 
  theme(plot.title = element_text(face = "bold", hjust = 0.5,
                                  size = 15, color = "darkblue"))
age.sex

## Warning: Removed 263 rows containing non-finite values (stat_density).

age.pclass <- full %>%  #Pclass에 따른 Age 탐색
  ggplot(aes(Age, fill = Pclass)) + 
  geom_density(alpha = .5) +  
  ggtitle("Titanic passengers Age density plot") + 
  theme(plot.title = element_text(face = "bold", hjust = 0.5,
                                  size = 15, color = "darkblue"))
age.pclass

## Warning: Removed 263 rows containing non-finite values (stat_density).

age.title <- full %>%  #Title에 따른 Age 탐색
  ggplot(aes(Age, fill = Title)) + 
  geom_density(alpha = .5) +  
  ggtitle("Titanic passengers Age density plot") + 
  theme(plot.title = element_text(face = "bold", hjust = 0.5,
                                  size = 15, color = "darkblue"))
age.title

## Warning: Removed 263 rows containing non-finite values (stat_density).

plot(full$Title)

# title별 Median Age를 통한 결측값 처리
full <- as.data.frame(full)
summaryBy(Age ~ Title, data=full, FUN=c(mean, sd, median), na.rm=TRUE) ## ddply, library(doBy)

##     Title  Age.mean    Age.sd Age.median
## 1  Master  5.482642  4.161554          4
## 2    Miss 22.026000 12.300349         22
## 3      Mr 32.252151 12.422089         29
## 4     Mrs 36.918129 12.902087         35
## 5 Officer 45.307692 11.460434         48

full$Age <- ifelse((is.na(full$Age) & full$Title == 'Master'), 4, full$Age)
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Miss'), 22, full$Age)
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Mr'), 29, full$Age)
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Mrs'), 35, full$Age)
full$Age <- ifelse((is.na(full$Age) & full$Title == 'Officer'), 48, full$Age)

#Age 변수 가공
hist(full$Age, freq=F, main='Age',col='lightgreen', ylim=c(0,0.05))

# child : 18세 이하
# adult : 19세 이상 64세 이하
# senior : 65세 이상

full$Age <- ifelse(full$Age <= 18, "child",
                   ifelse(full$Age > 18 & full$Age <= 64, "adult","senior"))

# Ticket 변수를 이용하여 GroupSize 파생변수를 생성한다
length(unique(full$Ticket))

## [1] 929

head(full$Ticket)

## [1] "A/5 21171"        "PC 17599"         "STON/O2. 3101282"
## [4] "113803"           "373450"           "330877"

full  %>%  arrange(Ticket) %>% head() #같은 티켓인데도 불구하고 Family가 single, 친구등과 같이 온것으로 유추

##   PassengerId Survived Pclass
## 1         258        1      1
## 2         505        1      1
## 3         760        1      1
## 4         263        0      1
## 5         559        1      1
## 6         586        1      1
##                                                       Name Sex   Age SibSp
## 1                                     Cherry, Miss. Gladys   1 adult     0
## 2                                    Maioni, Miss. Roberta   1 child     0
## 3 Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)   1 adult     0
## 4                                        Taussig, Mr. Emil   0 adult     1
## 5                   Taussig, Mrs. Emil (Tillie Mandelbaum)   1 adult     1
## 6                                      Taussig, Miss. Ruth   1 child     0
##   Parch Ticket  Fare Embarked   Title Fsize Familysize Deck
## 1     0 110152 86.50        S    Miss     1     single    B
## 2     0 110152 86.50        S    Miss     1     single    B
## 3     0 110152 86.50        S Officer     1     single    B
## 4     1 110413 79.65        S      Mr     3      small    E
## 5     1 110413 79.65        S     Mrs     3      small    E
## 6     2 110413 79.65        S    Miss     3      small    E

full$TravelGroup <- NA
full <- (transform(full, TravelGroup = match(Ticket, unique(Ticket))))
head(full)

##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name Sex   Age SibSp
## 1                             Braund, Mr. Owen Harris   0 adult     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer)   1 adult     1
## 3                              Heikkinen, Miss. Laina   1 adult     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel)   1 adult     1
## 5                            Allen, Mr. William Henry   0 adult     0
## 6                                    Moran, Mr. James   0 adult     0
##   Parch           Ticket    Fare Embarked Title Fsize Familysize Deck
## 1     0        A/5 21171  7.2500        S    Mr     2      small    Z
## 2     0         PC 17599 71.2833        C   Mrs     2      small    C
## 3     0 STON/O2. 3101282  7.9250        S  Miss     1     single    Z
## 4     0           113803 53.1000        S   Mrs     2      small    C
## 5     0           373450  8.0500        S    Mr     1     single    Z
## 6     0           330877  8.4583        Q    Mr     1     single    Z
##   TravelGroup
## 1           1
## 2           2
## 3           3
## 4           4
## 5           5
## 6           6

full <- full %>% 
  group_by(TravelGroup) %>% 
  mutate(GroupSize = n()) %>%
  ungroup()

full  %>% arrange(Ticket)  %>% head()

## # A tibble: 6 x 17
##   PassengerId Survived Pclass Name  Sex   Age   SibSp Parch Ticket  Fare
##         <dbl> <fct>    <ord>  <fct> <fct> <chr> <dbl> <dbl> <chr>  <dbl>
## 1         258 1        1      Cher~ 1     adult     0     0 110152  86.5
## 2         505 1        1      Maio~ 1     child     0     0 110152  86.5
## 3         760 1        1      Roth~ 1     adult     0     0 110152  86.5
## 4         263 0        1      Taus~ 0     adult     1     1 110413  79.6
## 5         559 1        1      Taus~ 1     adult     1     1 110413  79.6
## 6         586 1        1      Taus~ 1     child     0     2 110413  79.6
## # ... with 7 more variables: Embarked <fct>, Title <fct>, Fsize <dbl>,
## #   Familysize <fct>, Deck <chr>, TravelGroup <int>, GroupSize <int>

0.6 prediction

# 변수선택    Pclass, Sex, Age, Fare, Embarked, Title, Fsize, GroupSize, Deck
str(full)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1309 obs. of  17 variables:
##  $ PassengerId: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass     : Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 156 287 531 430 23 826 775 922 613 855 ...
##  $ Sex        : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Age        : chr  "adult" "adult" "adult" "adult" ...
##  $ SibSp      : num  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ Title      : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 3 1 4 4 ...
##  $ Fsize      : num  2 2 1 2 1 1 1 5 3 2 ...
##  $ Familysize : Factor w/ 3 levels "large","single",..: 3 3 2 3 2 2 2 1 3 3 ...
##  $ Deck       : chr  "Z" "C" "Z" "C" ...
##  $ TravelGroup: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ GroupSize  : int  1 2 1 2 1 1 2 5 3 2 ...

#범주화 안된 변수들 범주화 처리
factor_vars <- c('Age','GroupSize','Deck')

full[factor_vars] <- lapply(full[factor_vars], function(x) as.factor(x))

#Fare log변환
full$Fare=log1p(full$Fare)

full <- full  %>%  select(-c(1,4,7,8,9,13,16))
str(full)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1309 obs. of  10 variables:
##  $ Survived  : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass    : Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Sex       : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Age       : Factor w/ 3 levels "adult","child",..: 1 1 1 1 1 1 1 2 1 2 ...
##  $ Fare      : num  2.11 4.28 2.19 3.99 2.2 ...
##  $ Embarked  : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ Title     : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 3 1 4 4 ...
##  $ Familysize: Factor w/ 3 levels "large","single",..: 3 3 2 3 2 2 2 1 3 3 ...
##  $ Deck      : Factor w/ 11 levels "A","B","C","D",..: 11 3 11 3 11 11 5 11 11 10 ...
##  $ GroupSize : Factor w/ 9 levels "1","2","3","4",..: 1 2 1 2 1 1 2 5 3 2 ...

train <- full  %>% filter(is.na(Survived)==FALSE)
test <- full  %>% filter(is.na(Survived)==TRUE)

train_label <- as.numeric(train$Survived)-1
test_label <- test$Survived

x_train <- model.matrix(~.-1, data = train[,-1]) %>% data.frame
x_test <- model.matrix(~.-1, data = test[,-1]) %>% data.frame
head(x_train)

##   Pclass1 Pclass2 Pclass3 Sex1 Agechild Agesenior     Fare EmbarkedQ
## 1       0       0       1    0        0         0 2.110213         0
## 2       1       0       0    1        0         0 4.280593         0
## 3       0       0       1    1        0         0 2.188856         0
## 4       1       0       0    1        0         0 3.990834         0
## 5       0       0       1    0        0         0 2.202765         0
## 6       0       0       1    0        0         0 2.246893         1
##   EmbarkedS TitleMiss TitleMr TitleMrs TitleOfficer Familysizesingle
## 1         1         0       1        0            0                0
## 2         0         0       0        1            0                0
## 3         1         1       0        0            0                1
## 4         1         0       0        1            0                0
## 5         1         0       1        0            0                1
## 6         0         0       1        0            0                1
##   Familysizesmall DeckB DeckC DeckD DeckE DeckF DeckG DeckT DeckX DeckY
## 1               1     0     0     0     0     0     0     0     0     0
## 2               1     0     1     0     0     0     0     0     0     0
## 3               0     0     0     0     0     0     0     0     0     0
## 4               1     0     1     0     0     0     0     0     0     0
## 5               0     0     0     0     0     0     0     0     0     0
## 6               0     0     0     0     0     0     0     0     0     0
##   DeckZ GroupSize2 GroupSize3 GroupSize4 GroupSize5 GroupSize6 GroupSize7
## 1     1          0          0          0          0          0          0
## 2     0          1          0          0          0          0          0
## 3     1          0          0          0          0          0          0
## 4     0          1          0          0          0          0          0
## 5     1          0          0          0          0          0          0
## 6     1          0          0          0          0          0          0
##   GroupSize8 GroupSize11
## 1          0           0
## 2          0           0
## 3          0           0
## 4          0           0
## 5          0           0
## 6          0           0

head(x_test)

##   Pclass1 Pclass2 Pclass3 Sex1 Agechild Agesenior     Fare EmbarkedQ
## 1       0       0       1    0        0         0 2.178064         1
## 2       0       0       1    1        0         0 2.079442         0
## 3       0       1       0    0        0         0 2.369075         1
## 4       0       0       1    0        0         0 2.268252         0
## 5       0       0       1    1        0         0 2.586824         0
## 6       0       0       1    0        1         0 2.324836         0
##   EmbarkedS TitleMiss TitleMr TitleMrs TitleOfficer Familysizesingle
## 1         0         0       1        0            0                1
## 2         1         0       0        1            0                0
## 3         0         0       1        0            0                1
## 4         1         0       1        0            0                1
## 5         1         0       0        1            0                0
## 6         1         0       1        0            0                1
##   Familysizesmall DeckB DeckC DeckD DeckE DeckF DeckG DeckT DeckX DeckY
## 1               0     0     0     0     0     0     0     0     0     0
## 2               1     0     0     0     0     0     0     0     0     0
## 3               0     0     0     0     0     0     0     0     0     1
## 4               0     0     0     0     0     0     0     0     0     0
## 5               1     0     0     0     0     0     0     0     0     0
## 6               0     0     0     0     0     0     0     0     0     0
##   DeckZ GroupSize2 GroupSize3 GroupSize4 GroupSize5 GroupSize6 GroupSize7
## 1     1          0          0          0          0          0          0
## 2     1          0          0          0          0          0          0
## 3     0          0          0          0          0          0          0
## 4     1          0          0          0          0          0          0
## 5     1          1          0          0          0          0          0
## 6     1          0          0          0          0          0          0
##   GroupSize8 GroupSize11
## 1          0           0
## 2          0           0
## 3          0           0
## 4          0           0
## 5          0           0
## 6          0           0

# XGBOOST  -  cross vaildation

dtrain <- xgb.DMatrix(data = as.matrix(x_train), label=train_label)
dtest <- xgb.DMatrix(data = as.matrix(x_test))

set.seed(2019)
param <- list(objective   = "binary:logistic",
              eval_metric = "auc",
              max_depth   = 6,
              eta         = 0.01,
              gammma      = 0,
              subsamle    = 0.5,
              colsample_bytree = 0.5,
              min_child_weight = 5)

xgb <- xgb.train(params  = param,
                 data    = dtrain,
                 nrounds = 4790,
                 silent = 1,
                 print_every_n = 100,
                 verbose = 0)
summary(xgb)

##               Length  Class              Mode       
## handle              1 xgb.Booster.handle externalptr
## raw           3882785 -none-             raw        
## niter               1 -none-             numeric    
## call                7 -none-             call       
## params             10 -none-             list       
## callbacks           0 -none-             list       
## feature_names      33 -none-             character  
## nfeatures           1 -none-             numeric

set.seed(123)
split <- createDataPartition(y = train$Survived,p = 0.7,list = FALSE)

new_train <- train[split,] 
new_test <- train[-split,]

x_label <- as.numeric(new_train$Survived)-1
y_label <- as.numeric(new_test$Survived)-1

new_train2 <- model.matrix(~.-1, data = new_train[,-1]) %>% data.frame
new_test2 <- model.matrix(~.-1, data = new_test[,-1]) %>% data.frame

dtrain2 <- xgb.DMatrix(data = as.matrix(new_train2), label=x_label)
dtest2 <- xgb.DMatrix(data = as.matrix(new_test2), label=y_label)

xgb2 <- xgb.train(params  = param,
                  data    = dtrain2,
                  nrounds = 4790,
                  silent = 1,
                  print_every_n = 100,
                  verbose = 0)
summary(xgb2)

##               Length  Class              Mode       
## handle              1 xgb.Booster.handle externalptr
## raw           3449489 -none-             raw        
## niter               1 -none-             numeric    
## call                7 -none-             call       
## params             10 -none-             list       
## callbacks           0 -none-             list       
## feature_names      33 -none-             character  
## nfeatures           1 -none-             numeric

set.seed(2019)
XGB_pred2 <- predict(xgb2, dtest2)

head(XGB_pred2,10)

##  [1] 0.52341890 0.79136348 0.88854092 0.05676261 0.05077031 0.27443644
##  [7] 0.81016070 0.05935763 0.11204775 0.20620725

head(new_test$Survived,10)

##  [1] 1 1 1 0 0 0 1 0 1 1
## Levels: 0 1

set.seed(2019)
XGB_pred2 <- predict(xgb2, dtest2)
XGB_pred2 <- ifelse(XGB_pred2>=0.5,1,0)

0.7 plot ROC

pr <- prediction(XGB_pred2,new_test$Survived)
perf <- performance(pr,measure = "tpr",x.measure = "fpr") 

plot(perf)

auc(new_test$Survived, XGB_pred2)

## [1] 0.7938187

0.8 submission

# Submission
set.seed(2019)
XGB_pred <- predict(xgb, dtest)
XGB_pred <- ifelse(XGB_pred>=0.4,1,0)

xgb.importance(colnames(dtrain), model = xgb) %>% 
  xgb.plot.importance(top_n = 30)

Analysis_practice

updragon

2019 10 29

0.1 packages

0.2 import data

0.3 EDA

0.4 data mumming_1

0.5 data mumming_2

0.6 prediction

0.7 plot ROC

0.8 submission