# Load the Titanic data from the local CSV file into `titanic`.
titanic <- read_csv(file = "D:/R_File/Decision-Tree/titanic_data.csv")
## Parsed with column specification:
## cols(
## PassengerId = col_integer(),
## Survived = col_integer(),
## Pclass = col_integer(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_integer(),
## Parch = col_integer(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
### 2. Data exploration and creation of derived variables
# Column-wise overview of the data: 891 observations, 12 variables.
glimpse(x = titanic)
## Observations: 891
## Variables: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,...
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bra...
## $ Sex <chr> "male", "female", "female", "female", "male", "mal...
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, ...
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "1138...
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, ...
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", ...
# Derived flag: "Cabin" when a cabin value is recorded, "Passenger" when Cabin
# is NA.
# NOTE(review): the original comment said this separates passengers from crew;
# the code itself only checks whether Cabin is missing -- confirm the intent.
titanic$Cabin_Derived <- ifelse(is.na(titanic$Cabin), "Passenger", "Cabin")

# Joint proportions of Pclass (rows) by Survived (columns), with margin sums.
addmargins(
  prop.table(
    table(titanic[["Pclass"]], titanic[["Survived"]])
  )
)
##
## 0 1 Sum
## 1 0.08978676 0.15263749 0.24242424
## 2 0.10886644 0.09764310 0.20650954
## 3 0.41750842 0.13355780 0.55106622
## Sum 0.61616162 0.38383838 1.00000000
# Stacked bars per Pclass, filled by survival; heights are shares of all rows.
ggplot(titanic, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Joint proportions of Sex (rows) by Survived (columns), with margin sums.
addmargins(
  prop.table(
    table(titanic[["Sex"]], titanic[["Survived"]])
  )
)
##
## 0 1 Sum
## female 0.09090909 0.26150393 0.35241302
## male 0.52525253 0.12233446 0.64758698
## Sum 0.61616162 0.38383838 1.00000000
# Stacked bars per Sex, filled by survival; heights are shares of all rows.
ggplot(titanic, aes(x = Sex, fill = factor(Survived))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Joint proportions of Age (one row per distinct value) by Survived.
# table() drops rows with missing Age, so the shares are of non-missing rows.
addmargins(
  prop.table(
    table(titanic[["Age"]], titanic[["Survived"]])
  )
)
##
## 0 1 Sum
## 0.42 0.000000000 0.001400560 0.001400560
## 0.67 0.000000000 0.001400560 0.001400560
## 0.75 0.000000000 0.002801120 0.002801120
## 0.83 0.000000000 0.002801120 0.002801120
## 0.92 0.000000000 0.001400560 0.001400560
## 1 0.002801120 0.007002801 0.009803922
## 2 0.009803922 0.004201681 0.014005602
## 3 0.001400560 0.007002801 0.008403361
## 4 0.004201681 0.009803922 0.014005602
## 5 0.000000000 0.005602241 0.005602241
## 6 0.001400560 0.002801120 0.004201681
## 7 0.002801120 0.001400560 0.004201681
## 8 0.002801120 0.002801120 0.005602241
## 9 0.008403361 0.002801120 0.011204482
## 10 0.002801120 0.000000000 0.002801120
## 11 0.004201681 0.001400560 0.005602241
## 12 0.000000000 0.001400560 0.001400560
## 13 0.000000000 0.002801120 0.002801120
## 14 0.004201681 0.004201681 0.008403361
## 14.5 0.001400560 0.000000000 0.001400560
## 15 0.001400560 0.005602241 0.007002801
## 16 0.015406162 0.008403361 0.023809524
## 17 0.009803922 0.008403361 0.018207283
## 18 0.023809524 0.012605042 0.036414566
## 19 0.022408964 0.012605042 0.035014006
## 20 0.016806723 0.004201681 0.021008403
## 20.5 0.001400560 0.000000000 0.001400560
## 21 0.026610644 0.007002801 0.033613445
## 22 0.022408964 0.015406162 0.037815126
## 23 0.014005602 0.007002801 0.021008403
## 23.5 0.001400560 0.000000000 0.001400560
## 24 0.021008403 0.021008403 0.042016807
## 24.5 0.001400560 0.000000000 0.001400560
## 25 0.023809524 0.008403361 0.032212885
## 26 0.016806723 0.008403361 0.025210084
## 27 0.009803922 0.015406162 0.025210084
## 28 0.025210084 0.009803922 0.035014006
## 28.5 0.002801120 0.000000000 0.002801120
## 29 0.016806723 0.011204482 0.028011204
## 30 0.021008403 0.014005602 0.035014006
## 30.5 0.002801120 0.000000000 0.002801120
## 31 0.012605042 0.011204482 0.023809524
## 32 0.012605042 0.012605042 0.025210084
## 32.5 0.001400560 0.001400560 0.002801120
## 33 0.012605042 0.008403361 0.021008403
## 34 0.012605042 0.008403361 0.021008403
## 34.5 0.001400560 0.000000000 0.001400560
## 35 0.009803922 0.015406162 0.025210084
## 36 0.015406162 0.015406162 0.030812325
## 36.5 0.001400560 0.000000000 0.001400560
## 37 0.007002801 0.001400560 0.008403361
## 38 0.008403361 0.007002801 0.015406162
## 39 0.012605042 0.007002801 0.019607843
## 40 0.009803922 0.008403361 0.018207283
## 40.5 0.002801120 0.000000000 0.002801120
## 41 0.005602241 0.002801120 0.008403361
## 42 0.009803922 0.008403361 0.018207283
## 43 0.005602241 0.001400560 0.007002801
## 44 0.008403361 0.004201681 0.012605042
## 45 0.009803922 0.007002801 0.016806723
## 45.5 0.002801120 0.000000000 0.002801120
## 46 0.004201681 0.000000000 0.004201681
## 47 0.011204482 0.001400560 0.012605042
## 48 0.004201681 0.008403361 0.012605042
## 49 0.002801120 0.005602241 0.008403361
## 50 0.007002801 0.007002801 0.014005602
## 51 0.007002801 0.002801120 0.009803922
## 52 0.004201681 0.004201681 0.008403361
## 53 0.000000000 0.001400560 0.001400560
## 54 0.007002801 0.004201681 0.011204482
## 55 0.001400560 0.001400560 0.002801120
## 55.5 0.001400560 0.000000000 0.001400560
## 56 0.002801120 0.002801120 0.005602241
## 57 0.002801120 0.000000000 0.002801120
## 58 0.002801120 0.004201681 0.007002801
## 59 0.002801120 0.000000000 0.002801120
## 60 0.002801120 0.002801120 0.005602241
## 61 0.004201681 0.000000000 0.004201681
## 62 0.002801120 0.002801120 0.005602241
## 63 0.000000000 0.002801120 0.002801120
## 64 0.002801120 0.000000000 0.002801120
## 65 0.004201681 0.000000000 0.004201681
## 66 0.001400560 0.000000000 0.001400560
## 70 0.002801120 0.000000000 0.002801120
## 70.5 0.001400560 0.000000000 0.001400560
## 71 0.002801120 0.000000000 0.002801120
## 74 0.001400560 0.000000000 0.001400560
## 80 0.000000000 0.001400560 0.001400560
## Sum 0.593837535 0.406162465 1.000000000
# Age is continuous, so bin it with a histogram instead of drawing one bar per
# distinct value (geom_bar produced the warning "position_stack requires
# non-overlapping x intervals"). Bar heights remain shares of all plotted rows.
# Rows with missing Age (177 in this data) are still dropped with a warning.
ggplot(titanic, aes(x = Age, fill = factor(Survived))) +
  geom_histogram(aes(y = (..count..) / sum(..count..)), binwidth = 5) +
  theme_bw()
# Joint proportions of SibSp (rows) by Survived (columns), with margin sums.
# This call previously tabulated Parch by mistake (the Parch table is produced
# again further down, next to the Parch plot), while the plot that follows is
# for SibSp.
# NOTE: the printed table pasted directly below was generated from Parch and
# must be regenerated by re-running this line.
addmargins(prop.table(table(titanic$SibSp, titanic$Survived)))
##
## 0 1 Sum
## 0 0.499438833 0.261503928 0.760942761
## 1 0.059483726 0.072951740 0.132435466
## 2 0.044893378 0.044893378 0.089786756
## 3 0.002244669 0.003367003 0.005611672
## 4 0.004489338 0.000000000 0.004489338
## 5 0.004489338 0.001122334 0.005611672
## 6 0.001122334 0.000000000 0.001122334
## Sum 0.616161616 0.383838384 1.000000000
# Stacked bars per SibSp value, filled by survival; heights are shares of all
# rows.
ggplot(titanic, aes(x = SibSp, fill = factor(Survived))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Joint proportions of Parch (rows) by Survived (columns), with margin sums.
addmargins(
  prop.table(
    table(titanic[["Parch"]], titanic[["Survived"]])
  )
)
##
## 0 1 Sum
## 0 0.499438833 0.261503928 0.760942761
## 1 0.059483726 0.072951740 0.132435466
## 2 0.044893378 0.044893378 0.089786756
## 3 0.002244669 0.003367003 0.005611672
## 4 0.004489338 0.000000000 0.004489338
## 5 0.004489338 0.001122334 0.005611672
## 6 0.001122334 0.000000000 0.001122334
## Sum 0.616161616 0.383838384 1.000000000
# Stacked bars per Parch value, filled by survival; heights are shares of all
# rows.
ggplot(titanic, aes(x = Parch, fill = factor(Survived))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Joint proportions of Fare (one row per distinct value) by Survived.
addmargins(
  prop.table(
    table(titanic[["Fare"]], titanic[["Survived"]])
  )
)
##
## 0 1 Sum
## 0 0.015712682 0.001122334 0.016835017
## 4.0125 0.001122334 0.000000000 0.001122334
## 5 0.001122334 0.000000000 0.001122334
## 6.2375 0.001122334 0.000000000 0.001122334
## 6.4375 0.001122334 0.000000000 0.001122334
## 6.45 0.001122334 0.000000000 0.001122334
## 6.4958 0.002244669 0.000000000 0.002244669
## 6.75 0.002244669 0.000000000 0.002244669
## 6.8583 0.001122334 0.000000000 0.001122334
## 6.95 0.001122334 0.000000000 0.001122334
## 6.975 0.001122334 0.001122334 0.002244669
## 7.0458 0.001122334 0.000000000 0.001122334
## 7.05 0.007856341 0.000000000 0.007856341
## 7.0542 0.002244669 0.000000000 0.002244669
## 7.125 0.004489338 0.000000000 0.004489338
## 7.1417 0.000000000 0.001122334 0.001122334
## 7.225 0.010101010 0.003367003 0.013468013
## 7.2292 0.012345679 0.004489338 0.016835017
## 7.25 0.013468013 0.001122334 0.014590348
## 7.3125 0.001122334 0.000000000 0.001122334
## 7.4958 0.002244669 0.001122334 0.003367003
## 7.5208 0.001122334 0.000000000 0.001122334
## 7.55 0.003367003 0.001122334 0.004489338
## 7.6292 0.001122334 0.000000000 0.001122334
## 7.65 0.003367003 0.001122334 0.004489338
## 7.725 0.001122334 0.000000000 0.001122334
## 7.7292 0.001122334 0.000000000 0.001122334
## 7.7333 0.002244669 0.002244669 0.004489338
## 7.7375 0.001122334 0.001122334 0.002244669
## 7.7417 0.001122334 0.000000000 0.001122334
## 7.75 0.024691358 0.013468013 0.038159371
## 7.775 0.014590348 0.003367003 0.017957351
## 7.7875 0.000000000 0.001122334 0.001122334
## 7.7958 0.004489338 0.002244669 0.006734007
## 7.8 0.001122334 0.000000000 0.001122334
## 7.8292 0.001122334 0.001122334 0.002244669
## 7.8542 0.011223345 0.003367003 0.014590348
## 7.875 0.001122334 0.000000000 0.001122334
## 7.8792 0.000000000 0.004489338 0.004489338
## 7.8875 0.001122334 0.000000000 0.001122334
## 7.8958 0.041526375 0.001122334 0.042648709
## 7.925 0.011223345 0.008978676 0.020202020
## 8.0292 0.000000000 0.001122334 0.001122334
## 8.05 0.042648709 0.005611672 0.048260382
## 8.1125 0.000000000 0.001122334 0.001122334
## 8.1375 0.001122334 0.000000000 0.001122334
## 8.1583 0.001122334 0.000000000 0.001122334
## 8.3 0.001122334 0.000000000 0.001122334
## 8.3625 0.001122334 0.000000000 0.001122334
## 8.4042 0.001122334 0.000000000 0.001122334
## 8.4333 0.001122334 0.000000000 0.001122334
## 8.4583 0.001122334 0.000000000 0.001122334
## 8.5167 0.000000000 0.001122334 0.001122334
## 8.6542 0.001122334 0.000000000 0.001122334
## 8.6625 0.013468013 0.001122334 0.014590348
## 8.6833 0.000000000 0.001122334 0.001122334
## 8.7125 0.001122334 0.000000000 0.001122334
## 8.85 0.001122334 0.000000000 0.001122334
## 9 0.002244669 0.000000000 0.002244669
## 9.2167 0.001122334 0.000000000 0.001122334
## 9.225 0.002244669 0.000000000 0.002244669
## 9.35 0.001122334 0.001122334 0.002244669
## 9.475 0.001122334 0.000000000 0.001122334
## 9.4833 0.001122334 0.000000000 0.001122334
## 9.5 0.007856341 0.002244669 0.010101010
## 9.5875 0.001122334 0.001122334 0.002244669
## 9.825 0.002244669 0.000000000 0.002244669
## 9.8375 0.001122334 0.000000000 0.001122334
## 9.8417 0.000000000 0.001122334 0.001122334
## 9.8458 0.001122334 0.000000000 0.001122334
## 10.1708 0.001122334 0.000000000 0.001122334
## 10.4625 0.002244669 0.000000000 0.002244669
## 10.5 0.016835017 0.010101010 0.026936027
## 10.5167 0.001122334 0.000000000 0.001122334
## 11.1333 0.000000000 0.003367003 0.003367003
## 11.2417 0.000000000 0.002244669 0.002244669
## 11.5 0.004489338 0.000000000 0.004489338
## 12 0.000000000 0.001122334 0.001122334
## 12.275 0.001122334 0.000000000 0.001122334
## 12.2875 0.000000000 0.001122334 0.001122334
## 12.35 0.001122334 0.002244669 0.003367003
## 12.475 0.000000000 0.004489338 0.004489338
## 12.525 0.001122334 0.000000000 0.001122334
## 12.65 0.000000000 0.001122334 0.001122334
## 12.875 0.001122334 0.000000000 0.001122334
## 13 0.029180696 0.017957351 0.047138047
## 13.4167 0.000000000 0.001122334 0.001122334
## 13.5 0.003367003 0.001122334 0.004489338
## 13.7917 0.000000000 0.001122334 0.001122334
## 13.8583 0.000000000 0.001122334 0.001122334
## 13.8625 0.000000000 0.001122334 0.001122334
## 14 0.001122334 0.000000000 0.001122334
## 14.1083 0.001122334 0.000000000 0.001122334
## 14.4 0.002244669 0.000000000 0.002244669
## 14.4542 0.006734007 0.001122334 0.007856341
## 14.4583 0.003367003 0.000000000 0.003367003
## 14.5 0.005611672 0.002244669 0.007856341
## 15 0.001122334 0.000000000 0.001122334
## 15.0458 0.001122334 0.000000000 0.001122334
## 15.05 0.001122334 0.000000000 0.001122334
## 15.1 0.001122334 0.000000000 0.001122334
## 15.2458 0.002244669 0.003367003 0.005611672
## 15.5 0.005611672 0.003367003 0.008978676
## 15.55 0.001122334 0.000000000 0.001122334
## 15.7417 0.000000000 0.002244669 0.002244669
## 15.75 0.000000000 0.001122334 0.001122334
## 15.85 0.002244669 0.002244669 0.004489338
## 15.9 0.000000000 0.002244669 0.002244669
## 16 0.000000000 0.001122334 0.001122334
## 16.1 0.007856341 0.002244669 0.010101010
## 16.7 0.000000000 0.002244669 0.002244669
## 17.4 0.000000000 0.001122334 0.001122334
## 17.8 0.002244669 0.000000000 0.002244669
## 18 0.003367003 0.000000000 0.003367003
## 18.75 0.000000000 0.003367003 0.003367003
## 18.7875 0.001122334 0.001122334 0.002244669
## 19.2583 0.000000000 0.004489338 0.004489338
## 19.5 0.000000000 0.002244669 0.002244669
## 19.9667 0.002244669 0.000000000 0.002244669
## 20.2125 0.002244669 0.000000000 0.002244669
## 20.25 0.001122334 0.001122334 0.002244669
## 20.525 0.001122334 0.002244669 0.003367003
## 20.575 0.001122334 0.001122334 0.002244669
## 21 0.004489338 0.002244669 0.006734007
## 21.075 0.004489338 0.000000000 0.004489338
## 21.6792 0.001122334 0.000000000 0.001122334
## 22.025 0.000000000 0.001122334 0.001122334
## 22.3583 0.000000000 0.002244669 0.002244669
## 22.525 0.001122334 0.000000000 0.001122334
## 23 0.000000000 0.004489338 0.004489338
## 23.25 0.000000000 0.002244669 0.002244669
## 23.45 0.002244669 0.000000000 0.002244669
## 24 0.001122334 0.001122334 0.002244669
## 24.15 0.007856341 0.001122334 0.008978676
## 25.4667 0.004489338 0.000000000 0.004489338
## 25.5875 0.001122334 0.000000000 0.001122334
## 25.925 0.001122334 0.000000000 0.001122334
## 25.9292 0.000000000 0.002244669 0.002244669
## 26 0.017957351 0.016835017 0.034792368
## 26.25 0.002244669 0.004489338 0.006734007
## 26.2833 0.000000000 0.001122334 0.001122334
## 26.2875 0.000000000 0.003367003 0.003367003
## 26.3875 0.000000000 0.001122334 0.001122334
## 26.55 0.007856341 0.008978676 0.016835017
## 27 0.001122334 0.001122334 0.002244669
## 27.7208 0.004489338 0.001122334 0.005611672
## 27.75 0.002244669 0.002244669 0.004489338
## 27.9 0.006734007 0.000000000 0.006734007
## 28.5 0.001122334 0.000000000 0.001122334
## 28.7125 0.001122334 0.000000000 0.001122334
## 29 0.000000000 0.002244669 0.002244669
## 29.125 0.005611672 0.000000000 0.005611672
## 29.7 0.002244669 0.001122334 0.003367003
## 30 0.001122334 0.005611672 0.006734007
## 30.0708 0.001122334 0.001122334 0.002244669
## 30.5 0.001122334 0.004489338 0.005611672
## 30.6958 0.002244669 0.000000000 0.002244669
## 31 0.001122334 0.002244669 0.003367003
## 31.275 0.007856341 0.000000000 0.007856341
## 31.3875 0.001122334 0.003367003 0.004489338
## 32.3208 0.001122334 0.000000000 0.001122334
## 32.5 0.000000000 0.001122334 0.001122334
## 33 0.001122334 0.002244669 0.003367003
## 33.5 0.001122334 0.000000000 0.001122334
## 34.0208 0.001122334 0.000000000 0.001122334
## 34.375 0.004489338 0.000000000 0.004489338
## 34.6542 0.001122334 0.000000000 0.001122334
## 35 0.001122334 0.000000000 0.001122334
## 35.5 0.001122334 0.003367003 0.004489338
## 36.75 0.001122334 0.001122334 0.002244669
## 37.0042 0.001122334 0.001122334 0.002244669
## 38.5 0.001122334 0.000000000 0.001122334
## 39 0.001122334 0.003367003 0.004489338
## 39.4 0.000000000 0.001122334 0.001122334
## 39.6 0.001122334 0.001122334 0.002244669
## 39.6875 0.006734007 0.000000000 0.006734007
## 40.125 0.001122334 0.000000000 0.001122334
## 41.5792 0.001122334 0.002244669 0.003367003
## 42.4 0.001122334 0.000000000 0.001122334
## 46.9 0.006734007 0.000000000 0.006734007
## 47.1 0.001122334 0.000000000 0.001122334
## 49.5 0.000000000 0.001122334 0.001122334
## 49.5042 0.001122334 0.001122334 0.002244669
## 50 0.001122334 0.000000000 0.001122334
## 50.4958 0.001122334 0.000000000 0.001122334
## 51.4792 0.000000000 0.001122334 0.001122334
## 51.8625 0.001122334 0.001122334 0.002244669
## 52 0.004489338 0.003367003 0.007856341
## 52.5542 0.000000000 0.003367003 0.003367003
## 53.1 0.002244669 0.003367003 0.005611672
## 55 0.000000000 0.002244669 0.002244669
## 55.4417 0.000000000 0.001122334 0.001122334
## 55.9 0.001122334 0.001122334 0.002244669
## 56.4958 0.002244669 0.005611672 0.007856341
## 56.9292 0.000000000 0.002244669 0.002244669
## 57 0.000000000 0.002244669 0.002244669
## 57.9792 0.000000000 0.002244669 0.002244669
## 59.4 0.000000000 0.001122334 0.001122334
## 61.175 0.001122334 0.000000000 0.001122334
## 61.3792 0.001122334 0.000000000 0.001122334
## 61.9792 0.001122334 0.000000000 0.001122334
## 63.3583 0.000000000 0.001122334 0.001122334
## 65 0.000000000 0.002244669 0.002244669
## 66.6 0.001122334 0.001122334 0.002244669
## 69.3 0.000000000 0.002244669 0.002244669
## 69.55 0.007856341 0.000000000 0.007856341
## 71 0.001122334 0.001122334 0.002244669
## 71.2833 0.000000000 0.001122334 0.001122334
## 73.5 0.005611672 0.000000000 0.005611672
## 75.25 0.000000000 0.001122334 0.001122334
## 76.2917 0.000000000 0.001122334 0.001122334
## 76.7292 0.000000000 0.003367003 0.003367003
## 77.2875 0.002244669 0.000000000 0.002244669
## 77.9583 0.000000000 0.003367003 0.003367003
## 78.2667 0.000000000 0.002244669 0.002244669
## 78.85 0.001122334 0.001122334 0.002244669
## 79.2 0.002244669 0.002244669 0.004489338
## 79.65 0.001122334 0.002244669 0.003367003
## 80 0.000000000 0.002244669 0.002244669
## 81.8583 0.000000000 0.001122334 0.001122334
## 82.1708 0.001122334 0.001122334 0.002244669
## 83.1583 0.000000000 0.003367003 0.003367003
## 83.475 0.001122334 0.001122334 0.002244669
## 86.5 0.000000000 0.003367003 0.003367003
## 89.1042 0.000000000 0.002244669 0.002244669
## 90 0.001122334 0.003367003 0.004489338
## 91.0792 0.000000000 0.002244669 0.002244669
## 93.5 0.000000000 0.002244669 0.002244669
## 106.425 0.001122334 0.001122334 0.002244669
## 108.9 0.001122334 0.001122334 0.002244669
## 110.8833 0.001122334 0.003367003 0.004489338
## 113.275 0.001122334 0.002244669 0.003367003
## 120 0.000000000 0.004489338 0.004489338
## 133.65 0.000000000 0.002244669 0.002244669
## 134.5 0.000000000 0.002244669 0.002244669
## 135.6333 0.001122334 0.002244669 0.003367003
## 146.5208 0.000000000 0.002244669 0.002244669
## 151.55 0.002244669 0.002244669 0.004489338
## 153.4625 0.001122334 0.002244669 0.003367003
## 164.8667 0.000000000 0.002244669 0.002244669
## 211.3375 0.000000000 0.003367003 0.003367003
## 211.5 0.001122334 0.000000000 0.001122334
## 221.7792 0.001122334 0.000000000 0.001122334
## 227.525 0.001122334 0.003367003 0.004489338
## 247.5208 0.001122334 0.001122334 0.002244669
## 262.375 0.000000000 0.002244669 0.002244669
## 263 0.002244669 0.002244669 0.004489338
## 512.3292 0.000000000 0.003367003 0.003367003
## Sum 0.616161616 0.383838384 1.000000000
# Joint proportions of Embarked (rows) by Survived (columns), with margin sums.
# table() drops rows with missing Embarked, so shares are of non-missing rows.
addmargins(
  prop.table(
    table(titanic[["Embarked"]], titanic[["Survived"]])
  )
)
##
## 0 1 Sum
## C 0.08436445 0.10461192 0.18897638
## Q 0.05286839 0.03374578 0.08661417
## S 0.48031496 0.24409449 0.72440945
## Sum 0.61754781 0.38245219 1.00000000
# Stacked bars per Embarked value, filled by survival; heights are shares of
# all rows. theme_bw() added so this plot matches every other plot in the
# script.
ggplot(titanic, aes(x = Embarked, fill = factor(Survived))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Same Pclass-by-survival bars, drawn in one panel per Embarked value.
ggplot(titanic, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  facet_wrap(~Embarked) +
  theme_bw()

# Joint proportions of Cabin_Derived (rows) by Survived (columns), with margin
# sums.
addmargins(prop.table(table(titanic$Cabin_Derived, titanic$Survived)))
##
## 0 1 Sum
## Cabin 0.07631874 0.15263749 0.22895623
## Passenger 0.53984287 0.23120090 0.77104377
## Sum 0.61616162 0.38383838 1.00000000
# Stacked bars per Cabin_Derived value, filled by survival; heights are shares
# of all rows.
ggplot(titanic, aes(x = Cabin_Derived, fill = factor(Survived))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()
# Reproducible hold-out split: set aside `n_test` rows for testing and train on
# the rest (891 - 100 = 791 rows for this data, the size used originally).
set.seed(9999)
n_test <- 100L
samp <- sample(nrow(titanic), nrow(titanic) - n_test, replace = FALSE)
train <- titanic[samp, ]
test <- titanic[-samp, ]

# Fit a tree for Survived on the training rows that have a non-missing Age,
# using five predictors chosen as likely to affect survival: Pclass, Sex, Age,
# Embarked and Cabin_Derived. Each leaf must hold at least 20 rows.
# Survived is read as an integer column, so the fitted tree is a regression
# tree: the summary reports a mean and MSE per node.
fit <- rpart(
  Survived ~ .,
  data = subset(
    train,
    !is.na(Age),
    select = c(Survived, Pclass, Sex, Age, Embarked, Cabin_Derived)
  ),
  minbucket = 20
)
summary(fit)
## Call:
## rpart(formula = Survived ~ ., data = subset(train, !is.na(Age),
## select = c(Survived, Pclass, Sex, Age, Embarked, Cabin_Derived)),
## minbucket = 20)
## n= 636
##
## CP nsplit rel error xerror xstd
## 1 0.29627268 0 1.0000000 1.0017800 0.01554345
## 2 0.07700486 1 0.7037273 0.7072918 0.03852536
## 3 0.04027813 2 0.6267225 0.6308317 0.03673766
## 4 0.01657573 3 0.5864443 0.5945572 0.03566076
## 5 0.01565968 4 0.5698686 0.6076780 0.03725464
## 6 0.01000000 5 0.5542089 0.5901032 0.03695116
##
## Variable importance
## Sex Pclass Cabin_Derived Age Embarked
## 57 20 11 10 1
##
## Node number 1: 636 observations, complexity param=0.2962727
## mean=0.4040881, MSE=0.2408009
## left son=2 (403 obs) right son=3 (233 obs)
## Primary splits:
## Sex splits as RL, improve=0.29627270, (0 missing)
## Pclass < 2.5 to the right, improve=0.10731890, (0 missing)
## Cabin_Derived splits as RL, improve=0.09795191, (0 missing)
## Embarked splits as RLL, improve=0.03911036, (2 missing)
## Age < 6.5 to the right, improve=0.01852715, (0 missing)
## Surrogate splits:
## Age < 15.5 to the right, agree=0.64, adj=0.017, (0 split)
##
## Node number 2: 403 observations, complexity param=0.04027813
## mean=0.2009926, MSE=0.1605945
## left son=4 (316 obs) right son=5 (87 obs)
## Primary splits:
## Cabin_Derived splits as RL, improve=0.09531223, (0 missing)
## Age < 6.5 to the right, improve=0.06815317, (0 missing)
## Pclass < 1.5 to the right, improve=0.06671570, (0 missing)
## Embarked splits as RLL, improve=0.02279722, (0 missing)
## Surrogate splits:
## Pclass < 1.5 to the right, agree=0.938, adj=0.713, (0 split)
##
## Node number 3: 233 observations, complexity param=0.07700486
## mean=0.7553648, MSE=0.1847888
## left son=6 (95 obs) right son=7 (138 obs)
## Primary splits:
## Pclass < 2.5 to the right, improve=0.27390610, (0 missing)
## Cabin_Derived splits as RL, improve=0.07423864, (0 missing)
## Embarked splits as RLL, improve=0.03889985, (2 missing)
## Age < 12 to the left, improve=0.03566095, (0 missing)
## Surrogate splits:
## Cabin_Derived splits as RL, agree=0.695, adj=0.253, (0 split)
## Age < 22.5 to the left, agree=0.678, adj=0.211, (0 split)
## Embarked splits as RLR, agree=0.614, adj=0.053, (0 split)
##
## Node number 4: 316 observations, complexity param=0.01565968
## mean=0.1360759, MSE=0.1175593
## left son=8 (294 obs) right son=9 (22 obs)
## Primary splits:
## Age < 9.5 to the right, improve=0.0645586000, (0 missing)
## Embarked splits as RLL, improve=0.0100961400, (0 missing)
## Pclass < 2.5 to the left, improve=0.0002551339, (0 missing)
##
## Node number 5: 87 observations, complexity param=0.01657573
## mean=0.4367816, MSE=0.2460034
## left son=10 (35 obs) right son=11 (52 obs)
## Primary splits:
## Age < 43 to the right, improve=0.118611700, (0 missing)
## Embarked splits as RLL, improve=0.003654971, (0 missing)
## Surrogate splits:
## Embarked splits as RLR, agree=0.609, adj=0.029, (0 split)
##
## Node number 6: 95 observations
## mean=0.4842105, MSE=0.2497507
##
## Node number 7: 138 observations
## mean=0.942029, MSE=0.05461038
##
## Node number 8: 294 observations
## mean=0.1122449, MSE=0.09964598
##
## Node number 9: 22 observations
## mean=0.4545455, MSE=0.2479339
##
## Node number 10: 35 observations
## mean=0.2285714, MSE=0.1763265
##
## Node number 11: 52 observations
## mean=0.5769231, MSE=0.2440828
# Draw the fitted tree.
fancyRpartPlot(model = fit)
### Variable importance for survival reported by summary(fit): Sex 57,
### Pclass 20, Cabin_Derived 11, Age 10, Embarked 1.

# Predicted value for each hold-out row (the mean of the leaf it falls into).
pred <- predict(object = fit, newdata = test)
print(pred)
## 1 2 3 4 5 6 7
## 0.9420290 0.4842105 0.9420290 0.1122449 0.4842105 0.9420290 0.9420290
## 8 9 10 11 12 13 14
## 0.1122449 0.1122449 0.1122449 0.1122449 0.1122449 0.1122449 0.1122449
## 15 16 17 18 19 20 21
## 0.4842105 0.1122449 0.5769231 0.9420290 0.1122449 0.4545455 0.9420290
## 22 23 24 25 26 27 28
## 0.2285714 0.1122449 0.4842105 0.1122449 0.1122449 0.4842105 0.1122449
## 29 30 31 32 33 34 35
## 0.4842105 0.4842105 0.4545455 0.9420290 0.9420290 0.1122449 0.1122449
## 36 37 38 39 40 41 42
## 0.1122449 0.9420290 0.9420290 0.1122449 0.1122449 0.9420290 0.1122449
## 43 44 45 46 47 48 49
## 0.4842105 0.1122449 0.1122449 0.1122449 0.1122449 0.1122449 0.5769231
## 50 51 52 53 54 55 56
## 0.2285714 0.9420290 0.2285714 0.5769231 0.1122449 0.9420290 0.9420290
## 57 58 59 60 61 62 63
## 0.1122449 0.1122449 0.9420290 0.9420290 0.9420290 0.9420290 0.1122449
## 64 65 66 67 68 69 70
## 0.5769231 0.1122449 0.1122449 0.1122449 0.4842105 0.1122449 0.9420290
## 71 72 73 74 75 76 77
## 0.1122449 0.1122449 0.4842105 0.1122449 0.1122449 0.4842105 0.1122449
## 78 79 80 81 82 83 84
## 0.1122449 0.2285714 0.1122449 0.1122449 0.4842105 0.1122449 0.1122449
## 85 86 87 88 89 90 91
## 0.1122449 0.9420290 0.4545455 0.9420290 0.9420290 0.1122449 0.4545455
## 92 93 94 95 96 97 98
## 0.1122449 0.1122449 0.9420290 0.1122449 0.9420290 0.1122449 0.9420290
## 99 100
## 0.4842105 0.1122449
# Score the hold-out rows once and reuse the result for both derived columns
# (the original called predict() twice for the same values).
surv_prob <- predict(fit, newdata = test)
test$Survived_pred <- ifelse(surv_prob > 0.5, 1, 0) # 1 = predicted survivor
test$Survived_pred_prob <- surv_prob

# Compare predicted survival with actual survival on the hold-out rows.
# Both factors get explicit 0/1 levels so the comparison still works when one
# class happens to be absent from the predictions or from the test rows.
# NOTE(review): the first level (0) is treated as the 'Positive' class, so the
# reported Sensitivity refers to non-survivors -- confirm this is intended.
confusionMatrix(
  data = factor(test$Survived_pred, levels = c(0, 1)),
  reference = factor(test$Survived, levels = c(0, 1))
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 55 16
## 1 4 25
##
## Accuracy : 0.8
## 95% CI : (0.7082, 0.8733)
## No Information Rate : 0.59
## P-Value [Acc > NIR] : 6.861e-06
##
## Kappa : 0.5673
## Mcnemar's Test P-Value : 0.01391
##
## Sensitivity : 0.9322
## Specificity : 0.6098
## Pos Pred Value : 0.7746
## Neg Pred Value : 0.8621
## Prevalence : 0.5900
## Detection Rate : 0.5500
## Detection Prevalence : 0.7100
## Balanced Accuracy : 0.7710
##
## 'Positive' Class : 0
##