데이터 출처

Titanic: Machine Learning from Disaster, https://www.kaggle.com/c/titanic/data

0. 필요 라이브러리 로드 (아래 코드에서 사용: readr, dplyr, ggplot2, rpart, rattle, caret)

1. Data load

# Load the Titanic CSV into `titanic`.
# NOTE(review): absolute, machine-specific path -- the file must exist there.
titanic <- read_csv(file = "D:/R_File/Decision-Tree/titanic_data.csv")
## Parsed with column specification:
## cols(
##   PassengerId = col_integer(),
##   Survived = col_integer(),
##   Pclass = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_integer(),
##   Parch = col_integer(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )
### 2. 데이터 탐색 및 필요 변수 생성
glimpse(titanic) # 891 observations, 12 variables
## Observations: 891
## Variables: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,...
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bra...
## $ Sex         <chr> "male", "female", "female", "female", "male", "mal...
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, ...
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "1138...
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Cabin       <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, ...
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", ...
# Derive a two-level flag from Cabin: "Cabin" when a cabin value is recorded,
# "Passenger" when it is missing (NA).
# NOTE(review): the original comment said this separates passengers from crew,
# but the code only checks whether Cabin is NA -- confirm the intended meaning.
titanic$Cabin_Derived <- ifelse(is.na(titanic$Cabin), "Passenger", "Cabin")

2.1.선실등급별 생존율: 1등실 승객의 생존율이 2,3등실보다 높음, 선택

# Pclass x Survived cell counts divided by the grand total, with row and
# column sums appended.
addmargins(
  prop.table(
    table(titanic$Pclass, titanic$Survived)
  )
)
##      
##                0          1        Sum
##   1   0.08978676 0.15263749 0.24242424
##   2   0.10886644 0.09764310 0.20650954
##   3   0.41750842 0.13355780 0.55106622
##   Sum 0.61616162 0.38383838 1.00000000
# Stacked bars per Pclass, filled by survival; bar heights are counts divided
# by the total count rather than raw counts.
ggplot(data = titanic, mapping = aes(x = Pclass, fill = factor(Survived))) +
  geom_bar(mapping = aes(y = (..count..) / sum(..count..))) +
  theme_bw()

2.2.성별 생존율: 여성이 남성보다 생존 비율이 높음, 선택

# Sex x Survived cell counts divided by the grand total, with row and column
# sums appended.
addmargins(
  prop.table(
    table(titanic$Sex, titanic$Survived)
  )
)
##         
##                   0          1        Sum
##   female 0.09090909 0.26150393 0.35241302
##   male   0.52525253 0.12233446 0.64758698
##   Sum    0.61616162 0.38383838 1.00000000
# Stacked bars per Sex, filled by survival; bar heights are counts divided by
# the total count.
ggplot(data = titanic, mapping = aes(x = Sex, fill = factor(Survived))) +
  geom_bar(mapping = aes(y = (..count..) / sum(..count..))) +
  theme_bw()

2.3.나이별 생존율:나이가 어리거나 젊은 사람이 생존율이 높음, 선택

# Age x Survived proportions with marginal sums. Age is not binned, so the
# table has one row per distinct age value.
addmargins(
  prop.table(
    table(titanic$Age, titanic$Survived)
  )
)
##       
##                  0           1         Sum
##   0.42 0.000000000 0.001400560 0.001400560
##   0.67 0.000000000 0.001400560 0.001400560
##   0.75 0.000000000 0.002801120 0.002801120
##   0.83 0.000000000 0.002801120 0.002801120
##   0.92 0.000000000 0.001400560 0.001400560
##   1    0.002801120 0.007002801 0.009803922
##   2    0.009803922 0.004201681 0.014005602
##   3    0.001400560 0.007002801 0.008403361
##   4    0.004201681 0.009803922 0.014005602
##   5    0.000000000 0.005602241 0.005602241
##   6    0.001400560 0.002801120 0.004201681
##   7    0.002801120 0.001400560 0.004201681
##   8    0.002801120 0.002801120 0.005602241
##   9    0.008403361 0.002801120 0.011204482
##   10   0.002801120 0.000000000 0.002801120
##   11   0.004201681 0.001400560 0.005602241
##   12   0.000000000 0.001400560 0.001400560
##   13   0.000000000 0.002801120 0.002801120
##   14   0.004201681 0.004201681 0.008403361
##   14.5 0.001400560 0.000000000 0.001400560
##   15   0.001400560 0.005602241 0.007002801
##   16   0.015406162 0.008403361 0.023809524
##   17   0.009803922 0.008403361 0.018207283
##   18   0.023809524 0.012605042 0.036414566
##   19   0.022408964 0.012605042 0.035014006
##   20   0.016806723 0.004201681 0.021008403
##   20.5 0.001400560 0.000000000 0.001400560
##   21   0.026610644 0.007002801 0.033613445
##   22   0.022408964 0.015406162 0.037815126
##   23   0.014005602 0.007002801 0.021008403
##   23.5 0.001400560 0.000000000 0.001400560
##   24   0.021008403 0.021008403 0.042016807
##   24.5 0.001400560 0.000000000 0.001400560
##   25   0.023809524 0.008403361 0.032212885
##   26   0.016806723 0.008403361 0.025210084
##   27   0.009803922 0.015406162 0.025210084
##   28   0.025210084 0.009803922 0.035014006
##   28.5 0.002801120 0.000000000 0.002801120
##   29   0.016806723 0.011204482 0.028011204
##   30   0.021008403 0.014005602 0.035014006
##   30.5 0.002801120 0.000000000 0.002801120
##   31   0.012605042 0.011204482 0.023809524
##   32   0.012605042 0.012605042 0.025210084
##   32.5 0.001400560 0.001400560 0.002801120
##   33   0.012605042 0.008403361 0.021008403
##   34   0.012605042 0.008403361 0.021008403
##   34.5 0.001400560 0.000000000 0.001400560
##   35   0.009803922 0.015406162 0.025210084
##   36   0.015406162 0.015406162 0.030812325
##   36.5 0.001400560 0.000000000 0.001400560
##   37   0.007002801 0.001400560 0.008403361
##   38   0.008403361 0.007002801 0.015406162
##   39   0.012605042 0.007002801 0.019607843
##   40   0.009803922 0.008403361 0.018207283
##   40.5 0.002801120 0.000000000 0.002801120
##   41   0.005602241 0.002801120 0.008403361
##   42   0.009803922 0.008403361 0.018207283
##   43   0.005602241 0.001400560 0.007002801
##   44   0.008403361 0.004201681 0.012605042
##   45   0.009803922 0.007002801 0.016806723
##   45.5 0.002801120 0.000000000 0.002801120
##   46   0.004201681 0.000000000 0.004201681
##   47   0.011204482 0.001400560 0.012605042
##   48   0.004201681 0.008403361 0.012605042
##   49   0.002801120 0.005602241 0.008403361
##   50   0.007002801 0.007002801 0.014005602
##   51   0.007002801 0.002801120 0.009803922
##   52   0.004201681 0.004201681 0.008403361
##   53   0.000000000 0.001400560 0.001400560
##   54   0.007002801 0.004201681 0.011204482
##   55   0.001400560 0.001400560 0.002801120
##   55.5 0.001400560 0.000000000 0.001400560
##   56   0.002801120 0.002801120 0.005602241
##   57   0.002801120 0.000000000 0.002801120
##   58   0.002801120 0.004201681 0.007002801
##   59   0.002801120 0.000000000 0.002801120
##   60   0.002801120 0.002801120 0.005602241
##   61   0.004201681 0.000000000 0.004201681
##   62   0.002801120 0.002801120 0.005602241
##   63   0.000000000 0.002801120 0.002801120
##   64   0.002801120 0.000000000 0.002801120
##   65   0.004201681 0.000000000 0.004201681
##   66   0.001400560 0.000000000 0.001400560
##   70   0.002801120 0.000000000 0.002801120
##   70.5 0.001400560 0.000000000 0.001400560
##   71   0.002801120 0.000000000 0.002801120
##   74   0.001400560 0.000000000 0.001400560
##   80   0.000000000 0.001400560 0.001400560
##   Sum  0.593837535 0.406162465 1.000000000
# Age is continuous (it includes fractional ages), so bin it into 5-year
# intervals instead of drawing one bar per distinct value; the per-value bars
# overlapped and triggered the position_stack warning. Heights are still
# counts divided by the total count. Rows with missing Age are still dropped
# by ggplot2 with a warning. The warnings printed below come from the previous
# geom_bar() version and need to be regenerated.
ggplot(titanic, aes(x = Age, fill = factor(Survived))) +
  geom_histogram(aes(y = (..count..) / sum(..count..)), binwidth = 5) +
  theme_bw()
## Warning: Removed 177 rows containing non-finite values (stat_count).
## Warning: position_stack requires non-overlapping x intervals

2.4.형제자매/배우자 동승자(SibSp) 생존율: 특이점 발견 못함, 미선택

# Section 2.4 covers siblings/spouses aboard, so tabulate SibSp here (the plot
# for this section already uses SibSp). The previous call tabulated Parch,
# which duplicated the table in section 2.5; the printed table below was
# produced by that call and needs to be regenerated.
addmargins(prop.table(table(titanic$SibSp, titanic$Survived)))
##      
##                 0           1         Sum
##   0   0.499438833 0.261503928 0.760942761
##   1   0.059483726 0.072951740 0.132435466
##   2   0.044893378 0.044893378 0.089786756
##   3   0.002244669 0.003367003 0.005611672
##   4   0.004489338 0.000000000 0.004489338
##   5   0.004489338 0.001122334 0.005611672
##   6   0.001122334 0.000000000 0.001122334
##   Sum 0.616161616 0.383838384 1.000000000
# Stacked bars per SibSp value, filled by survival; bar heights are counts
# divided by the total count.
ggplot(data = titanic, mapping = aes(x = SibSp, fill = factor(Survived))) +
  geom_bar(mapping = aes(y = (..count..) / sum(..count..))) +
  theme_bw()

2.5.부모자식 동승자(Parch) 생존율: 특이점 발견 못함, 미선택

# Parch x Survived cell counts divided by the grand total, with row and
# column sums appended.
addmargins(
  prop.table(
    table(titanic$Parch, titanic$Survived)
  )
)
##      
##                 0           1         Sum
##   0   0.499438833 0.261503928 0.760942761
##   1   0.059483726 0.072951740 0.132435466
##   2   0.044893378 0.044893378 0.089786756
##   3   0.002244669 0.003367003 0.005611672
##   4   0.004489338 0.000000000 0.004489338
##   5   0.004489338 0.001122334 0.005611672
##   6   0.001122334 0.000000000 0.001122334
##   Sum 0.616161616 0.383838384 1.000000000
# Stacked bars per Parch value, filled by survival; bar heights are counts
# divided by the total count.
ggplot(data = titanic, mapping = aes(x = Parch, fill = factor(Survived))) +
  geom_bar(mapping = aes(y = (..count..) / sum(..count..))) +
  theme_bw()

2.6. 요금(Fare)에 따른 생존율: 특이점 발견 못함, 미선택

# Fare x Survived proportions with marginal sums. Fare is not binned, so the
# table has one row per distinct fare value.
addmargins(
  prop.table(
    table(titanic$Fare, titanic$Survived)
  )
)
##           
##                      0           1         Sum
##   0        0.015712682 0.001122334 0.016835017
##   4.0125   0.001122334 0.000000000 0.001122334
##   5        0.001122334 0.000000000 0.001122334
##   6.2375   0.001122334 0.000000000 0.001122334
##   6.4375   0.001122334 0.000000000 0.001122334
##   6.45     0.001122334 0.000000000 0.001122334
##   6.4958   0.002244669 0.000000000 0.002244669
##   6.75     0.002244669 0.000000000 0.002244669
##   6.8583   0.001122334 0.000000000 0.001122334
##   6.95     0.001122334 0.000000000 0.001122334
##   6.975    0.001122334 0.001122334 0.002244669
##   7.0458   0.001122334 0.000000000 0.001122334
##   7.05     0.007856341 0.000000000 0.007856341
##   7.0542   0.002244669 0.000000000 0.002244669
##   7.125    0.004489338 0.000000000 0.004489338
##   7.1417   0.000000000 0.001122334 0.001122334
##   7.225    0.010101010 0.003367003 0.013468013
##   7.2292   0.012345679 0.004489338 0.016835017
##   7.25     0.013468013 0.001122334 0.014590348
##   7.3125   0.001122334 0.000000000 0.001122334
##   7.4958   0.002244669 0.001122334 0.003367003
##   7.5208   0.001122334 0.000000000 0.001122334
##   7.55     0.003367003 0.001122334 0.004489338
##   7.6292   0.001122334 0.000000000 0.001122334
##   7.65     0.003367003 0.001122334 0.004489338
##   7.725    0.001122334 0.000000000 0.001122334
##   7.7292   0.001122334 0.000000000 0.001122334
##   7.7333   0.002244669 0.002244669 0.004489338
##   7.7375   0.001122334 0.001122334 0.002244669
##   7.7417   0.001122334 0.000000000 0.001122334
##   7.75     0.024691358 0.013468013 0.038159371
##   7.775    0.014590348 0.003367003 0.017957351
##   7.7875   0.000000000 0.001122334 0.001122334
##   7.7958   0.004489338 0.002244669 0.006734007
##   7.8      0.001122334 0.000000000 0.001122334
##   7.8292   0.001122334 0.001122334 0.002244669
##   7.8542   0.011223345 0.003367003 0.014590348
##   7.875    0.001122334 0.000000000 0.001122334
##   7.8792   0.000000000 0.004489338 0.004489338
##   7.8875   0.001122334 0.000000000 0.001122334
##   7.8958   0.041526375 0.001122334 0.042648709
##   7.925    0.011223345 0.008978676 0.020202020
##   8.0292   0.000000000 0.001122334 0.001122334
##   8.05     0.042648709 0.005611672 0.048260382
##   8.1125   0.000000000 0.001122334 0.001122334
##   8.1375   0.001122334 0.000000000 0.001122334
##   8.1583   0.001122334 0.000000000 0.001122334
##   8.3      0.001122334 0.000000000 0.001122334
##   8.3625   0.001122334 0.000000000 0.001122334
##   8.4042   0.001122334 0.000000000 0.001122334
##   8.4333   0.001122334 0.000000000 0.001122334
##   8.4583   0.001122334 0.000000000 0.001122334
##   8.5167   0.000000000 0.001122334 0.001122334
##   8.6542   0.001122334 0.000000000 0.001122334
##   8.6625   0.013468013 0.001122334 0.014590348
##   8.6833   0.000000000 0.001122334 0.001122334
##   8.7125   0.001122334 0.000000000 0.001122334
##   8.85     0.001122334 0.000000000 0.001122334
##   9        0.002244669 0.000000000 0.002244669
##   9.2167   0.001122334 0.000000000 0.001122334
##   9.225    0.002244669 0.000000000 0.002244669
##   9.35     0.001122334 0.001122334 0.002244669
##   9.475    0.001122334 0.000000000 0.001122334
##   9.4833   0.001122334 0.000000000 0.001122334
##   9.5      0.007856341 0.002244669 0.010101010
##   9.5875   0.001122334 0.001122334 0.002244669
##   9.825    0.002244669 0.000000000 0.002244669
##   9.8375   0.001122334 0.000000000 0.001122334
##   9.8417   0.000000000 0.001122334 0.001122334
##   9.8458   0.001122334 0.000000000 0.001122334
##   10.1708  0.001122334 0.000000000 0.001122334
##   10.4625  0.002244669 0.000000000 0.002244669
##   10.5     0.016835017 0.010101010 0.026936027
##   10.5167  0.001122334 0.000000000 0.001122334
##   11.1333  0.000000000 0.003367003 0.003367003
##   11.2417  0.000000000 0.002244669 0.002244669
##   11.5     0.004489338 0.000000000 0.004489338
##   12       0.000000000 0.001122334 0.001122334
##   12.275   0.001122334 0.000000000 0.001122334
##   12.2875  0.000000000 0.001122334 0.001122334
##   12.35    0.001122334 0.002244669 0.003367003
##   12.475   0.000000000 0.004489338 0.004489338
##   12.525   0.001122334 0.000000000 0.001122334
##   12.65    0.000000000 0.001122334 0.001122334
##   12.875   0.001122334 0.000000000 0.001122334
##   13       0.029180696 0.017957351 0.047138047
##   13.4167  0.000000000 0.001122334 0.001122334
##   13.5     0.003367003 0.001122334 0.004489338
##   13.7917  0.000000000 0.001122334 0.001122334
##   13.8583  0.000000000 0.001122334 0.001122334
##   13.8625  0.000000000 0.001122334 0.001122334
##   14       0.001122334 0.000000000 0.001122334
##   14.1083  0.001122334 0.000000000 0.001122334
##   14.4     0.002244669 0.000000000 0.002244669
##   14.4542  0.006734007 0.001122334 0.007856341
##   14.4583  0.003367003 0.000000000 0.003367003
##   14.5     0.005611672 0.002244669 0.007856341
##   15       0.001122334 0.000000000 0.001122334
##   15.0458  0.001122334 0.000000000 0.001122334
##   15.05    0.001122334 0.000000000 0.001122334
##   15.1     0.001122334 0.000000000 0.001122334
##   15.2458  0.002244669 0.003367003 0.005611672
##   15.5     0.005611672 0.003367003 0.008978676
##   15.55    0.001122334 0.000000000 0.001122334
##   15.7417  0.000000000 0.002244669 0.002244669
##   15.75    0.000000000 0.001122334 0.001122334
##   15.85    0.002244669 0.002244669 0.004489338
##   15.9     0.000000000 0.002244669 0.002244669
##   16       0.000000000 0.001122334 0.001122334
##   16.1     0.007856341 0.002244669 0.010101010
##   16.7     0.000000000 0.002244669 0.002244669
##   17.4     0.000000000 0.001122334 0.001122334
##   17.8     0.002244669 0.000000000 0.002244669
##   18       0.003367003 0.000000000 0.003367003
##   18.75    0.000000000 0.003367003 0.003367003
##   18.7875  0.001122334 0.001122334 0.002244669
##   19.2583  0.000000000 0.004489338 0.004489338
##   19.5     0.000000000 0.002244669 0.002244669
##   19.9667  0.002244669 0.000000000 0.002244669
##   20.2125  0.002244669 0.000000000 0.002244669
##   20.25    0.001122334 0.001122334 0.002244669
##   20.525   0.001122334 0.002244669 0.003367003
##   20.575   0.001122334 0.001122334 0.002244669
##   21       0.004489338 0.002244669 0.006734007
##   21.075   0.004489338 0.000000000 0.004489338
##   21.6792  0.001122334 0.000000000 0.001122334
##   22.025   0.000000000 0.001122334 0.001122334
##   22.3583  0.000000000 0.002244669 0.002244669
##   22.525   0.001122334 0.000000000 0.001122334
##   23       0.000000000 0.004489338 0.004489338
##   23.25    0.000000000 0.002244669 0.002244669
##   23.45    0.002244669 0.000000000 0.002244669
##   24       0.001122334 0.001122334 0.002244669
##   24.15    0.007856341 0.001122334 0.008978676
##   25.4667  0.004489338 0.000000000 0.004489338
##   25.5875  0.001122334 0.000000000 0.001122334
##   25.925   0.001122334 0.000000000 0.001122334
##   25.9292  0.000000000 0.002244669 0.002244669
##   26       0.017957351 0.016835017 0.034792368
##   26.25    0.002244669 0.004489338 0.006734007
##   26.2833  0.000000000 0.001122334 0.001122334
##   26.2875  0.000000000 0.003367003 0.003367003
##   26.3875  0.000000000 0.001122334 0.001122334
##   26.55    0.007856341 0.008978676 0.016835017
##   27       0.001122334 0.001122334 0.002244669
##   27.7208  0.004489338 0.001122334 0.005611672
##   27.75    0.002244669 0.002244669 0.004489338
##   27.9     0.006734007 0.000000000 0.006734007
##   28.5     0.001122334 0.000000000 0.001122334
##   28.7125  0.001122334 0.000000000 0.001122334
##   29       0.000000000 0.002244669 0.002244669
##   29.125   0.005611672 0.000000000 0.005611672
##   29.7     0.002244669 0.001122334 0.003367003
##   30       0.001122334 0.005611672 0.006734007
##   30.0708  0.001122334 0.001122334 0.002244669
##   30.5     0.001122334 0.004489338 0.005611672
##   30.6958  0.002244669 0.000000000 0.002244669
##   31       0.001122334 0.002244669 0.003367003
##   31.275   0.007856341 0.000000000 0.007856341
##   31.3875  0.001122334 0.003367003 0.004489338
##   32.3208  0.001122334 0.000000000 0.001122334
##   32.5     0.000000000 0.001122334 0.001122334
##   33       0.001122334 0.002244669 0.003367003
##   33.5     0.001122334 0.000000000 0.001122334
##   34.0208  0.001122334 0.000000000 0.001122334
##   34.375   0.004489338 0.000000000 0.004489338
##   34.6542  0.001122334 0.000000000 0.001122334
##   35       0.001122334 0.000000000 0.001122334
##   35.5     0.001122334 0.003367003 0.004489338
##   36.75    0.001122334 0.001122334 0.002244669
##   37.0042  0.001122334 0.001122334 0.002244669
##   38.5     0.001122334 0.000000000 0.001122334
##   39       0.001122334 0.003367003 0.004489338
##   39.4     0.000000000 0.001122334 0.001122334
##   39.6     0.001122334 0.001122334 0.002244669
##   39.6875  0.006734007 0.000000000 0.006734007
##   40.125   0.001122334 0.000000000 0.001122334
##   41.5792  0.001122334 0.002244669 0.003367003
##   42.4     0.001122334 0.000000000 0.001122334
##   46.9     0.006734007 0.000000000 0.006734007
##   47.1     0.001122334 0.000000000 0.001122334
##   49.5     0.000000000 0.001122334 0.001122334
##   49.5042  0.001122334 0.001122334 0.002244669
##   50       0.001122334 0.000000000 0.001122334
##   50.4958  0.001122334 0.000000000 0.001122334
##   51.4792  0.000000000 0.001122334 0.001122334
##   51.8625  0.001122334 0.001122334 0.002244669
##   52       0.004489338 0.003367003 0.007856341
##   52.5542  0.000000000 0.003367003 0.003367003
##   53.1     0.002244669 0.003367003 0.005611672
##   55       0.000000000 0.002244669 0.002244669
##   55.4417  0.000000000 0.001122334 0.001122334
##   55.9     0.001122334 0.001122334 0.002244669
##   56.4958  0.002244669 0.005611672 0.007856341
##   56.9292  0.000000000 0.002244669 0.002244669
##   57       0.000000000 0.002244669 0.002244669
##   57.9792  0.000000000 0.002244669 0.002244669
##   59.4     0.000000000 0.001122334 0.001122334
##   61.175   0.001122334 0.000000000 0.001122334
##   61.3792  0.001122334 0.000000000 0.001122334
##   61.9792  0.001122334 0.000000000 0.001122334
##   63.3583  0.000000000 0.001122334 0.001122334
##   65       0.000000000 0.002244669 0.002244669
##   66.6     0.001122334 0.001122334 0.002244669
##   69.3     0.000000000 0.002244669 0.002244669
##   69.55    0.007856341 0.000000000 0.007856341
##   71       0.001122334 0.001122334 0.002244669
##   71.2833  0.000000000 0.001122334 0.001122334
##   73.5     0.005611672 0.000000000 0.005611672
##   75.25    0.000000000 0.001122334 0.001122334
##   76.2917  0.000000000 0.001122334 0.001122334
##   76.7292  0.000000000 0.003367003 0.003367003
##   77.2875  0.002244669 0.000000000 0.002244669
##   77.9583  0.000000000 0.003367003 0.003367003
##   78.2667  0.000000000 0.002244669 0.002244669
##   78.85    0.001122334 0.001122334 0.002244669
##   79.2     0.002244669 0.002244669 0.004489338
##   79.65    0.001122334 0.002244669 0.003367003
##   80       0.000000000 0.002244669 0.002244669
##   81.8583  0.000000000 0.001122334 0.001122334
##   82.1708  0.001122334 0.001122334 0.002244669
##   83.1583  0.000000000 0.003367003 0.003367003
##   83.475   0.001122334 0.001122334 0.002244669
##   86.5     0.000000000 0.003367003 0.003367003
##   89.1042  0.000000000 0.002244669 0.002244669
##   90       0.001122334 0.003367003 0.004489338
##   91.0792  0.000000000 0.002244669 0.002244669
##   93.5     0.000000000 0.002244669 0.002244669
##   106.425  0.001122334 0.001122334 0.002244669
##   108.9    0.001122334 0.001122334 0.002244669
##   110.8833 0.001122334 0.003367003 0.004489338
##   113.275  0.001122334 0.002244669 0.003367003
##   120      0.000000000 0.004489338 0.004489338
##   133.65   0.000000000 0.002244669 0.002244669
##   134.5    0.000000000 0.002244669 0.002244669
##   135.6333 0.001122334 0.002244669 0.003367003
##   146.5208 0.000000000 0.002244669 0.002244669
##   151.55   0.002244669 0.002244669 0.004489338
##   153.4625 0.001122334 0.002244669 0.003367003
##   164.8667 0.000000000 0.002244669 0.002244669
##   211.3375 0.000000000 0.003367003 0.003367003
##   211.5    0.001122334 0.000000000 0.001122334
##   221.7792 0.001122334 0.000000000 0.001122334
##   227.525  0.001122334 0.003367003 0.004489338
##   247.5208 0.001122334 0.001122334 0.002244669
##   262.375  0.000000000 0.002244669 0.002244669
##   263      0.002244669 0.002244669 0.004489338
##   512.3292 0.000000000 0.003367003 0.003367003
##   Sum      0.616161616 0.383838384 1.000000000

2.7. 승선지역에 따른 생존율: C지역 승선객들의 생존 비율이 높은 것으로 나타남. 선택

# Embarked x Survived cell counts divided by the table total, with row and
# column sums appended. table() leaves out rows where Embarked is NA, so the
# marginal sums differ slightly from the other sections.
addmargins(
  prop.table(
    table(titanic$Embarked, titanic$Survived)
  )
)
##      
##                0          1        Sum
##   C   0.08436445 0.10461192 0.18897638
##   Q   0.05286839 0.03374578 0.08661417
##   S   0.48031496 0.24409449 0.72440945
##   Sum 0.61754781 0.38245219 1.00000000
# Stacked bars per Embarked value, filled by survival; bar heights are counts
# divided by the total count. theme_bw() is added so this chart matches every
# other plot in the report.
ggplot(titanic, aes(x = Embarked, fill = factor(Survived))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

2.8.승선지역과 승선등급별 생존율:승선지역과 승선등급은 생존여부에 영향을 미치는 것으로 판단.

# Pclass bars filled by survival, drawn as one panel per Embarked value.
ggplot(data = titanic, mapping = aes(x = Pclass, fill = factor(Survived))) +
  geom_bar(mapping = aes(y = (..count..) / sum(..count..))) +
  facet_wrap(~Embarked) +
  theme_bw()

2.9 객실(Cabin) 정보 유무별 생존율: 객실 번호가 기록된 승객(Cabin)의 생존율이 기록이 없는 승객(Passenger)보다 높음. 선택

# Cabin_Derived x Survived cell counts divided by the grand total, with row
# and column sums appended.
addmargins(
  prop.table(
    table(titanic$Cabin_Derived, titanic$Survived)
  )
)
##            
##                      0          1        Sum
##   Cabin     0.07631874 0.15263749 0.22895623
##   Passenger 0.53984287 0.23120090 0.77104377
##   Sum       0.61616162 0.38383838 1.00000000
# Stacked bars per Cabin_Derived level, filled by survival; bar heights are
# counts divided by the total count.
ggplot(
  data = titanic,
  mapping = aes(x = Cabin_Derived, fill = factor(Survived))
) +
  geom_bar(mapping = aes(y = (..count..) / sum(..count..))) +
  theme_bw()

3. Train(791 obs) & Test(100 obs) Data Set 생성

# Randomly hold out 100 rows for testing and train on the remainder
# (791 of the 891 rows). The fixed seed makes the split reproducible.
set.seed(9999)
n_test <- 100L
# Guard against a data set too small to leave any training rows.
stopifnot(nrow(titanic) > n_test)
# Row indices of the training sample, drawn without replacement.
samp <- sample(nrow(titanic), nrow(titanic) - n_test, replace = FALSE)
train <- titanic[samp, ]
test <- titanic[-samp, ]

4.Decision Tree 만들기

# Fit a tree for Survived on the five predictors judged to influence survival:
# Age, Pclass (ticket class), Sex, Embarked (port of embarkation) and
# Cabin_Derived (described in the original comment as "crew or not").
# Rows with missing Age are excluded from the training data, and minbucket
# requires at least 20 observations in every terminal node.
# NOTE(review): Survived is read as an integer and the summary below reports
# mean/MSE per node, i.e. this is a regression fit -- use a factor response
# with method = "class" if a classification tree is intended.
fit <- rpart(
  Survived ~ .,
  data = subset(
    train,
    !is.na(Age),
    select = c(Survived, Pclass, Sex, Age, Embarked, Cabin_Derived)
  ),
  minbucket = 20
)
summary(fit)
## Call:
## rpart(formula = Survived ~ ., data = subset(train, !is.na(Age), 
##     select = c(Survived, Pclass, Sex, Age, Embarked, Cabin_Derived)), 
##     minbucket = 20)
##   n= 636 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.29627268      0 1.0000000 1.0017800 0.01554345
## 2 0.07700486      1 0.7037273 0.7072918 0.03852536
## 3 0.04027813      2 0.6267225 0.6308317 0.03673766
## 4 0.01657573      3 0.5864443 0.5945572 0.03566076
## 5 0.01565968      4 0.5698686 0.6076780 0.03725464
## 6 0.01000000      5 0.5542089 0.5901032 0.03695116
## 
## Variable importance
##           Sex        Pclass Cabin_Derived           Age      Embarked 
##            57            20            11            10             1 
## 
## Node number 1: 636 observations,    complexity param=0.2962727
##   mean=0.4040881, MSE=0.2408009 
##   left son=2 (403 obs) right son=3 (233 obs)
##   Primary splits:
##       Sex           splits as  RL,       improve=0.29627270, (0 missing)
##       Pclass        < 2.5  to the right, improve=0.10731890, (0 missing)
##       Cabin_Derived splits as  RL,       improve=0.09795191, (0 missing)
##       Embarked      splits as  RLL,      improve=0.03911036, (2 missing)
##       Age           < 6.5  to the right, improve=0.01852715, (0 missing)
##   Surrogate splits:
##       Age < 15.5 to the right, agree=0.64, adj=0.017, (0 split)
## 
## Node number 2: 403 observations,    complexity param=0.04027813
##   mean=0.2009926, MSE=0.1605945 
##   left son=4 (316 obs) right son=5 (87 obs)
##   Primary splits:
##       Cabin_Derived splits as  RL,       improve=0.09531223, (0 missing)
##       Age           < 6.5  to the right, improve=0.06815317, (0 missing)
##       Pclass        < 1.5  to the right, improve=0.06671570, (0 missing)
##       Embarked      splits as  RLL,      improve=0.02279722, (0 missing)
##   Surrogate splits:
##       Pclass < 1.5  to the right, agree=0.938, adj=0.713, (0 split)
## 
## Node number 3: 233 observations,    complexity param=0.07700486
##   mean=0.7553648, MSE=0.1847888 
##   left son=6 (95 obs) right son=7 (138 obs)
##   Primary splits:
##       Pclass        < 2.5  to the right, improve=0.27390610, (0 missing)
##       Cabin_Derived splits as  RL,       improve=0.07423864, (0 missing)
##       Embarked      splits as  RLL,      improve=0.03889985, (2 missing)
##       Age           < 12   to the left,  improve=0.03566095, (0 missing)
##   Surrogate splits:
##       Cabin_Derived splits as  RL,       agree=0.695, adj=0.253, (0 split)
##       Age           < 22.5 to the left,  agree=0.678, adj=0.211, (0 split)
##       Embarked      splits as  RLR,      agree=0.614, adj=0.053, (0 split)
## 
## Node number 4: 316 observations,    complexity param=0.01565968
##   mean=0.1360759, MSE=0.1175593 
##   left son=8 (294 obs) right son=9 (22 obs)
##   Primary splits:
##       Age      < 9.5  to the right, improve=0.0645586000, (0 missing)
##       Embarked splits as  RLL,      improve=0.0100961400, (0 missing)
##       Pclass   < 2.5  to the left,  improve=0.0002551339, (0 missing)
## 
## Node number 5: 87 observations,    complexity param=0.01657573
##   mean=0.4367816, MSE=0.2460034 
##   left son=10 (35 obs) right son=11 (52 obs)
##   Primary splits:
##       Age      < 43   to the right, improve=0.118611700, (0 missing)
##       Embarked splits as  RLL,      improve=0.003654971, (0 missing)
##   Surrogate splits:
##       Embarked splits as  RLR, agree=0.609, adj=0.029, (0 split)
## 
## Node number 6: 95 observations
##   mean=0.4842105, MSE=0.2497507 
## 
## Node number 7: 138 observations
##   mean=0.942029, MSE=0.05461038 
## 
## Node number 8: 294 observations
##   mean=0.1122449, MSE=0.09964598 
## 
## Node number 9: 22 observations
##   mean=0.4545455, MSE=0.2479339 
## 
## Node number 10: 35 observations
##   mean=0.2285714, MSE=0.1763265 
## 
## Node number 11: 52 observations
##   mean=0.5769231, MSE=0.2440828
# Plot the fitted rpart tree `fit`.
fancyRpartPlot(fit)

### 선택 변수가 생존 여부에 미치는 중요도는 성별이 57, 선실등급이 20, 객실 정보 유무(Cabin_Derived)가 11, 나이가 10, 승선도시가 1로 나타남

5. Prediction : test 데이터로 예측모델을 활용하여 생존 예측 값 생성

# Score the held-out test rows with the fitted tree, then show the predicted
# values (one per test row).
pred <- predict(object = fit, newdata = test)

print(pred)
##         1         2         3         4         5         6         7 
## 0.9420290 0.4842105 0.9420290 0.1122449 0.4842105 0.9420290 0.9420290 
##         8         9        10        11        12        13        14 
## 0.1122449 0.1122449 0.1122449 0.1122449 0.1122449 0.1122449 0.1122449 
##        15        16        17        18        19        20        21 
## 0.4842105 0.1122449 0.5769231 0.9420290 0.1122449 0.4545455 0.9420290 
##        22        23        24        25        26        27        28 
## 0.2285714 0.1122449 0.4842105 0.1122449 0.1122449 0.4842105 0.1122449 
##        29        30        31        32        33        34        35 
## 0.4842105 0.4842105 0.4545455 0.9420290 0.9420290 0.1122449 0.1122449 
##        36        37        38        39        40        41        42 
## 0.1122449 0.9420290 0.9420290 0.1122449 0.1122449 0.9420290 0.1122449 
##        43        44        45        46        47        48        49 
## 0.4842105 0.1122449 0.1122449 0.1122449 0.1122449 0.1122449 0.5769231 
##        50        51        52        53        54        55        56 
## 0.2285714 0.9420290 0.2285714 0.5769231 0.1122449 0.9420290 0.9420290 
##        57        58        59        60        61        62        63 
## 0.1122449 0.1122449 0.9420290 0.9420290 0.9420290 0.9420290 0.1122449 
##        64        65        66        67        68        69        70 
## 0.5769231 0.1122449 0.1122449 0.1122449 0.4842105 0.1122449 0.9420290 
##        71        72        73        74        75        76        77 
## 0.1122449 0.1122449 0.4842105 0.1122449 0.1122449 0.4842105 0.1122449 
##        78        79        80        81        82        83        84 
## 0.1122449 0.2285714 0.1122449 0.1122449 0.4842105 0.1122449 0.1122449 
##        85        86        87        88        89        90        91 
## 0.1122449 0.9420290 0.4545455 0.9420290 0.9420290 0.1122449 0.4545455 
##        92        93        94        95        96        97        98 
## 0.1122449 0.1122449 0.9420290 0.1122449 0.9420290 0.1122449 0.9420290 
##        99       100 
## 0.4842105 0.1122449

6. 모델 Validation

# Score the test rows once and reuse the result for both derived columns
# (the original evaluated predict() twice).
test_prob <- predict(fit, newdata = test)
# Label a row as survived (1) when the predicted value exceeds 0.5, else 0.
test$Survived_pred <- ifelse(test_prob > 0.5, 1, 0)
test$Survived_pred_prob <- test_prob
# Compare predicted with actual survival. Both factors are given the same
# explicit levels so the comparison still works when only one class is
# predicted. As the output below shows, "0" is treated as the positive class,
# so Sensitivity refers to non-survivors.
confusionMatrix(
  factor(test$Survived_pred, levels = c(0, 1)),
  factor(test$Survived, levels = c(0, 1))
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 55 16
##          1  4 25
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7082, 0.8733)
##     No Information Rate : 0.59            
##     P-Value [Acc > NIR] : 6.861e-06       
##                                           
##                   Kappa : 0.5673          
##  Mcnemar's Test P-Value : 0.01391         
##                                           
##             Sensitivity : 0.9322          
##             Specificity : 0.6098          
##          Pos Pred Value : 0.7746          
##          Neg Pred Value : 0.8621          
##              Prevalence : 0.5900          
##          Detection Rate : 0.5500          
##    Detection Prevalence : 0.7100          
##       Balanced Accuracy : 0.7710          
##                                           
##        'Positive' Class : 0               
## 

모델의 생존예측 정확도(Accuracy)는 0.80 임.