# Load the comma-separated passenger table; the first row holds column names.
full <- read.csv("titanic3.txt", header = TRUE)
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lattice
# Reproducible 70/30 partition of the raw rows. Each partition is tagged with
# an `index` column so both can be cleaned together and separated again later.
set.seed(1234)
train_list <- createDataPartition(y = full$survived, p = 0.7, list = FALSE)
full_train <- full[train_list, ]
full_test <- full[-train_list, ]
train <- full_train %>% mutate(index = "train")
test <- full_test %>% mutate(index = "test")
full <- bind_rows(train, test)

# Drop the boat, body and home.dest columns.
full <- full %>% select(-boat, -body, -home.dest)

# Recode the outcome as text labels. In the titanic3 data 0 means the
# passenger died and 1 means the passenger survived, so 0 -> "사망" (died)
# and everything else -> "생존" (survived). The previous mapping had the two
# labels swapped.
# NOTE: knitted output further down that shows these labels was recorded with
# the swapped mapping; re-knit to refresh it.
full$survived <- ifelse(full$survived == 0, "사망", "생존")

# Frequency of each embarkation code (blank codes show up as an unnamed cell).
table(full$embarked)
##
## C Q S
## 2 270 123 914
# Convert the outcome and the categorical predictors to factors in one pass.
factor_cols <- c("survived", "pclass", "sex", "embarked")
full[factor_cols] <- lapply(full[factor_cols], as.factor)

# Show the embarkation levels (a blank level is expected at this point).
levels(full$embarked)
## [1] "" "C" "Q" "S"
# Treat blank embarkation codes as missing. The blank level is matched by
# value rather than by position, so a real port is never turned into NA if the
# data happens to contain no blank entries.
levels(full$embarked)[levels(full$embarked) == ""] <- NA

# Recount, including the NA cell, to confirm the blanks became missing values.
table(full$embarked, useNA = "always")
##
## C Q S <NA>
## 270 123 914 2
# Keep only rows where age, fare and embarked are all present, then count the
# remaining NA values per column as a sanity check.
full <- full %>%
  filter(!is.na(age), !is.na(fare), !is.na(embarked))
colSums(is.na(full))
## pclass survived name sex age sibsp parch ticket
## 0 0 0 0 0 0 0 0
## fare cabin embarked index
## 0 0 0 0
library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
# Numeric preprocessing (Yeo-Johnson, then centre, then scale) for age, sibsp,
# parch and fare.
#
# The transformation parameters are estimated from the TRAINING rows only and
# then applied to every row. Estimating them on train + test together, as was
# done before, lets information from the test set leak into the features.
# NOTE: evaluation output recorded further down was produced with the earlier
# preprocessing; re-knit to refresh it.

# Columns that are bookkeeping or identifiers and are excluded from modelling.
id_cols <- c("index", "name", "ticket", "cabin")
is_train <- full$index == "train"
is_test <- full$index == "test"
model_data <- full[setdiff(names(full), id_cols)]

prepped <- recipe(survived ~ ., data = model_data[is_train, ]) %>%
  step_YeoJohnson(age, sibsp, parch, fare) %>%
  step_center(age, sibsp, parch, fare) %>%
  step_scale(age, sibsp, parch, fare) %>%
  prep()

# Apply the training-derived transformation to all rows.
baked <- bake(prepped, new_data = model_data)

train <- baked[is_train, ]
test <- baked[is_test, ]

# Keep `full` populated with the processed features plus the excluded columns.
full <- bind_cols(baked, full[id_cols])
# Resampling setup: cross-validation scored with the two-class summary, which
# needs class probabilities to be computed.
ctrl <- trainControl(
  method = "cv",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Fit an rpart decision tree, tuned on ROC. (The object is called `rffit`, but
# the model is rpart.)
rffit <- train(
  survived ~ .,
  data = train,
  method = "rpart",
  metric = "ROC",
  trControl = ctrl
)

# Test-set predictions: class probabilities and hard class labels.
rffit1 <- predict(rffit, newdata = test, type = "prob")
rffit2 <- predict(rffit, newdata = test, type = "raw")
# Compare predicted classes with the true test labels. The coercion is a
# no-op when the column is already a factor. This evaluation used to be run
# twice in a row with identical output; it is now run once.
test$survived <- as.factor(test$survived)
confusionMatrix(rffit2, test$survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 사망 생존
## 사망 94 30
## 생존 38 154
##
## Accuracy : 0.7848
## 95% CI : (0.7353, 0.8288)
## No Information Rate : 0.5823
## P-Value [Acc > NIR] : 2.405e-14
##
## Kappa : 0.5538
##
## Mcnemar's Test P-Value : 0.396
##
## Sensitivity : 0.7121
## Specificity : 0.8370
## Pos Pred Value : 0.7581
## Neg Pred Value : 0.8021
## Prevalence : 0.4177
## Detection Rate : 0.2975
## Detection Prevalence : 0.3924
## Balanced Accuracy : 0.7745
##
## 'Positive' Class : 사망
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Numeric class codes of the hard predictions. Kept only so that any later
# code expecting `rffit2_num` still finds it; it is no longer used for the ROC.
rffit2_num <- as.numeric(rffit2)

# Build the ROC curve from the predicted probability of "생존" instead of the
# hard class labels. With a two-valued predictor the curve has a single
# operating point and its AUC equals the balanced accuracy (0.7745 in the
# earlier run), which does not measure ranking quality.
# `levels` and `direction` are given explicitly: "사망" is the control class,
# "생존" is the case class, and higher probabilities indicate a case.
result1 <- pROC::roc(
  response = test$survived,
  predictor = rffit1[["생존"]],
  levels = c("사망", "생존"),
  direction = "<"
)
result1
## (AUC output recorded earlier came from hard labels; re-knit to refresh.)