rm(list=ls())
getwd()
## [1] "C:/R"
setwd("c:/R")
ls()
## character(0)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
read.delim("titanic3.txt",header=TRUE,sep=",")->full
full%>% glimpse
## Rows: 1,309
## Columns: 14
## $ pclass <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ survived <int> 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, ~
## $ name <chr> "Allen, Miss. Elisabeth Walton", "Allison, Master. Hudson Tr~
## $ sex <chr> "female", "male", "female", "male", "female", "male", "femal~
## $ age <dbl> 29.00, 0.92, 2.00, 30.00, 25.00, 48.00, 63.00, 39.00, 53.00,~
## $ sibsp <int> 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ parch <int> 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ~
## $ ticket <chr> "24160", "113781", "113781", "113781", "113781", "19952", "1~
## $ fare <dbl> 211.3375, 151.5500, 151.5500, 151.5500, 151.5500, 26.5500, 7~
## $ cabin <chr> "B5", "C22 C26", "C22 C26", "C22 C26", "C22 C26", "E12", "D7~
## $ embarked <chr> "S", "S", "S", "S", "S", "S", "S", "S", "S", "C", "C", "C", ~
## $ boat <chr> "2", "11", "", "", "", "3", "10", "", "D", "", "", "4", "9",~
## $ body <int> NA, NA, NA, 135, NA, NA, NA, NA, NA, 22, 124, NA, NA, NA, NA~
## $ home.dest <chr> "St Louis, MO", "Montreal, PQ / Chesterville, ON", "Montreal~
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
set.seed(1234)
train_list<-createDataPartition(y=full$survived,p=0.7,list=FALSE)
full_train<-full[train_list,]
full_test<-full[-train_list,]
NROW(full_train)
## [1] 917
NROW(full_test)
## [1] 392
train<-full_train
test<-full_test
train %>% mutate(index='train')->train
test %>% mutate(index='test')->test
bind_rows(train,test)->full
full %>% select(-boat,-body,-home.dest)->full
full %>% glimpse
## Rows: 1,309
## Columns: 12
## $ pclass <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ survived <int> 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0~
## $ name <chr> "Allen, Miss. Elisabeth Walton", "Allison, Master. Hudson Tre~
## $ sex <chr> "female", "male", "male", "male", "female", "male", "male", "~
## $ age <dbl> 29.00, 0.92, 30.00, 48.00, 63.00, 71.00, 47.00, 24.00, 24.00,~
## $ sibsp <int> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0~
## $ parch <int> 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ ticket <chr> "24160", "113781", "113781", "19952", "13502", "PC 17609", "P~
## $ fare <dbl> 211.3375, 151.5500, 151.5500, 26.5500, 77.9583, 49.5042, 227.~
## $ cabin <chr> "B5", "C22 C26", "C22 C26", "E12", "D7", "", "C62 C64", "B35"~
## $ embarked <chr> "S", "S", "S", "S", "S", "C", "C", "C", "C", "C", "C", "S", "~
## $ index <chr> "train", "train", "train", "train", "train", "train", "train"~
library(caret)
full$survived<-ifelse(full$survived==0,"생존","사망")
full$survived<-as.factor(full$survived)
full$pclass<-as.factor(full$pclass)
full$sex<-as.factor(full$sex)
full$embarked<-as.factor(full$embarked)
summary(full)
## pclass survived name sex age
## 1:323 사망:500 Length:1309 female:466 Min. : 0.17
## 2:277 생존:809 Class :character male :843 1st Qu.:21.00
## 3:709 Mode :character Median :28.00
## Mean :29.88
## 3rd Qu.:39.00
## Max. :80.00
## NA's :263
## sibsp parch ticket fare
## Min. :0.0000 Min. :0.000 Length:1309 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 Class :character 1st Qu.: 7.896
## Median :0.0000 Median :0.000 Mode :character Median : 14.454
## Mean :0.4989 Mean :0.385 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 Max. :512.329
## NA's :1
## cabin embarked index
## Length:1309 : 2 Length:1309
## Class :character C:270 Class :character
## Mode :character Q:123 Mode :character
## S:914
##
##
##
str(full)
## 'data.frame': 1309 obs. of 12 variables:
## $ pclass : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ survived: Factor w/ 2 levels "사망","생존": 1 1 2 1 1 2 2 1 2 1 ...
## $ name : chr "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Mr. Hudson Joshua Creighton" "Anderson, Mr. Harry" ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 1 2 2 1 2 1 ...
## $ age : num 29 0.92 30 48 63 71 47 24 24 32 ...
## $ sibsp : int 0 1 1 0 1 0 1 0 0 0 ...
## $ parch : int 0 2 2 0 0 0 0 0 1 0 ...
## $ ticket : chr "24160" "113781" "113781" "19952" ...
## $ fare : num 211.3 151.6 151.6 26.6 78 ...
## $ cabin : chr "B5" "C22 C26" "C22 C26" "E12" ...
## $ embarked: Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 2 2 2 2 2 ...
## $ index : chr "train" "train" "train" "train" ...
levels(full$embarked)[1]<-NA
table(full$embarked,useNA="always")
##
## C Q S <NA>
## 270 123 914 2
full %>% filter(!is.na(age)&!is.na(fare)&!is.na(embarked))->full
colSums(is.na(full))
## pclass survived name sex age sibsp parch ticket
## 0 0 0 0 0 0 0 0
## fare cabin embarked index
## 0 0 0 0
library(recipes)
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
recipe(survived~.,data=full) %>% step_YeoJohnson(age,sibsp,parch,fare) %>%
step_center(age,sibsp,parch,fare) %>%
step_scale(age,sibsp,parch,fare) %>%
prep() %>% juice()->full
full%>% filter(index=='train') %>% select(-index,-name,-ticket,-cabin)->train
full%>%filter(index=='test') %>% select(-index,-name,-ticket,-cabin)->test
ctrl<-trainControl(method="cv",summaryFunction=twoClassSummary,
classProbs=TRUE)
train(survived~.,data=train,
method="rpart",metric='ROC',
trControl=ctrl)->rffit
rffit
## CART
##
## 727 samples
## 7 predictor
## 2 classes: '사망', '생존'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 655, 655, 654, 654, 654, 655, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.02133106 0.8000892 0.6281609 0.8827167
## 0.03242321 0.7689156 0.6722989 0.8387949
## 0.45051195 0.6362184 0.3786207 0.8938161
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02133106.
test %>% glimpse()
## Rows: 316
## Columns: 8
## $ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ sex <fct> female, female, male, female, female, female, male, female, m~
## $ age <dbl> -2.23651014, -0.27131350, 0.66758473, 1.52740429, -0.78647164~
## $ sibsp <dbl> 1.2828274, 1.2828274, -0.7239369, 1.5906279, 1.2828274, -0.72~
## $ parch <dbl> 1.7467662, 1.7467662, -0.6019626, -0.6019626, -0.6019626, -0.~
## $ fare <dbl> 1.8357480, 1.8357480, -4.3263865, 0.9629375, 2.1234978, 1.327~
## $ embarked <fct> S, S, S, S, C, S, S, C, C, S, S, C, C, C, S, C, S, C, S, C, S~
## $ survived <fct> 생존, 생존, 생존, 사망, 사망, 사망, 사망, 사망, 생존, 사망, ~
predict(rffit,test,type="prob")->rffit1
predict(rffit,test,type="raw")->rffit2
head(rffit1)
## 사망 생존
## 1 0.7426471 0.2573529
## 2 0.7426471 0.2573529
## 3 0.1662708 0.8337292
## 4 0.7426471 0.2573529
## 5 0.7426471 0.2573529
## 6 0.7426471 0.2573529
test$survived<-as.factor(test$survived)
confusionMatrix(rffit2,test$survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 사망 생존
## 사망 94 30
## 생존 38 154
##
## Accuracy : 0.7848
## 95% CI : (0.7353, 0.8288)
## No Information Rate : 0.5823
## P-Value [Acc > NIR] : 2.405e-14
##
## Kappa : 0.5538
##
## Mcnemar's Test P-Value : 0.396
##
## Sensitivity : 0.7121
## Specificity : 0.8370
## Pos Pred Value : 0.7581
## Neg Pred Value : 0.8021
## Prevalence : 0.4177
## Detection Rate : 0.2975
## Detection Prevalence : 0.3924
## Balanced Accuracy : 0.7745
##
## 'Positive' Class : 사망
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
rffit2_num<-as.numeric(rffit2)
rffit2_num
## [1] 1 1 2 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 2 2 2 1 1 2 2 1 1 1 1 2 1 2 2 1 1 2 1
## [38] 1 2 1 2 1 2 2 2 2 2 2 2 1 2 1 2 2 2 2 1 2 1 2 2 1 1 1 2 2 2 1 1 2 1 1 2 2
## [75] 1 2 1 1 2 2 1 1 2 2 2 1 1 1 1 2 2 2 2 1 2 1 2 1 1 1 2 2 2 2 1 1 1 2 2 1 1
## [112] 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2 2 2 2 2 1 1 2 2 1 1 2 1 1 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 2 1 1 1 2 1 2 2 1 1 2 2 2 2 2 2 1 2 2 1 2
## [186] 2 2 1 1 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 2 2 1 2 1 2 1 2 1
## [223] 1 1 1 2 2 1 2 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 1 2 1 2 2 1 1 2 2 1 1 2 1 1 2
## [260] 1 1 2 2 1 1 2 1 2 2 2 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2
## [297] 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2
result1<-pROC::roc(test$survived,rffit2_num)
## Setting levels: control = 사망, case = 생존
## Setting direction: controls < cases
result1
##
## Call:
## roc.default(response = test$survived, predictor = rffit2_num)
##
## Data: rffit2_num in 132 controls (test$survived 사망) < 184 cases (test$survived 생존).
## Area under the curve: 0.7745