bigdata_part04

read.delim("titanic3.txt",header=TRUE,sep=",")->full
library(dplyr)

## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: ggplot2

## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: lattice

set.seed(1234)
train_list<-createDataPartition(y=full$survived,p=0.7,list=FALSE)
full_train<-full[train_list,]
full_test<-full[-train_list,]
train<-full_train
test<-full_test
train %>% mutate(index='train')->train
test %>% mutate(index='test')->test
bind_rows(train,test)->full
full %>% select(-boat,-body,-home.dest)->full
full$survived<-ifelse(full$survived==0,"생존","사망")
table(full$embarked)

## 
##       C   Q   S 
##   2 270 123 914

full$survived<-as.factor(full$survived)
full$pclass<-as.factor(full$pclass)
full$sex<-as.factor(full$sex)
full$embarked<-as.factor(full$embarked)
levels(full$embarked)

## [1] ""  "C" "Q" "S"

levels(full$embarked)[1]<-NA
table(full$embarked,useNA="always")

## 
##    C    Q    S <NA> 
##  270  123  914    2

full %>% filter(!is.na(age)&!is.na(fare)&!is.na(embarked))->full
colSums(is.na(full))

##   pclass survived     name      sex      age    sibsp    parch   ticket 
##        0        0        0        0        0        0        0        0 
##     fare    cabin embarked    index 
##        0        0        0        0

library(recipes)

## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

recipe(survived~.,data=full) %>% step_YeoJohnson(age,sibsp,parch,fare) %>% 
  step_center(age,sibsp,parch,fare) %>% 
  step_scale(age,sibsp,parch,fare) %>% 
  prep() %>% juice()->full

full%>% filter(index=="train") %>% select(-index,-name,-ticket,-cabin)->train
full %>% filter(index=='test') %>% select(-index,-name,-ticket,-cabin)->test
ctrl<-trainControl(method="cv",summaryFunction = twoClassSummary,
                   classProbs = TRUE)
train(survived~.,data=train,
      method="rpart",metric='ROC',
      trControl=ctrl)->rffit

predict(rffit,test,type="prob")->rffit1
predict(rffit,test,type="raw")->rffit2

test$survived<-as.factor(test$survived)
confusionMatrix(rffit2,test$survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 사망 생존
##       사망   94   30
##       생존   38  154
##                                           
##                Accuracy : 0.7848          
##                  95% CI : (0.7353, 0.8288)
##     No Information Rate : 0.5823          
##     P-Value [Acc > NIR] : 2.405e-14       
##                                           
##                   Kappa : 0.5538          
##                                           
##  Mcnemar's Test P-Value : 0.396           
##                                           
##             Sensitivity : 0.7121          
##             Specificity : 0.8370          
##          Pos Pred Value : 0.7581          
##          Neg Pred Value : 0.8021          
##              Prevalence : 0.4177          
##          Detection Rate : 0.2975          
##    Detection Prevalence : 0.3924          
##       Balanced Accuracy : 0.7745          
##                                           
##        'Positive' Class : 사망            
##

test$survived<-as.factor(test$survived)
confusionMatrix(rffit2,test$survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 사망 생존
##       사망   94   30
##       생존   38  154
##                                           
##                Accuracy : 0.7848          
##                  95% CI : (0.7353, 0.8288)
##     No Information Rate : 0.5823          
##     P-Value [Acc > NIR] : 2.405e-14       
##                                           
##                   Kappa : 0.5538          
##                                           
##  Mcnemar's Test P-Value : 0.396           
##                                           
##             Sensitivity : 0.7121          
##             Specificity : 0.8370          
##          Pos Pred Value : 0.7581          
##          Neg Pred Value : 0.8021          
##              Prevalence : 0.4177          
##          Detection Rate : 0.2975          
##    Detection Prevalence : 0.3924          
##       Balanced Accuracy : 0.7745          
##                                           
##        'Positive' Class : 사망            
##

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 다음의 패키지를 부착합니다: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

rffit2_num<-as.numeric(rffit2)
rffit2_num

##   [1] 1 1 2 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 2 2 2 1 1 2 2 1 1 1 1 2 1 2 2 1 1 2 1
##  [38] 1 2 1 2 1 2 2 2 2 2 2 2 1 2 1 2 2 2 2 1 2 1 2 2 1 1 1 2 2 2 1 1 2 1 1 2 2
##  [75] 1 2 1 1 2 2 1 1 2 2 2 1 1 1 1 2 2 2 2 1 2 1 2 1 1 1 2 2 2 2 1 1 1 2 2 1 1
## [112] 1 2 1 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2 2 2 2 2 1 1 2 2 1 1 2 1 1 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 2 1 1 1 2 1 2 2 1 1 2 2 2 2 2 2 1 2 2 1 2
## [186] 2 2 1 1 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 2 2 1 2 1 2 1 2 1
## [223] 1 1 1 2 2 1 2 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 1 2 1 2 2 1 1 2 2 1 1 2 1 1 2
## [260] 1 1 2 2 1 1 2 1 2 2 2 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2
## [297] 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2

result1<-pROC::roc(test$survived,rffit2_num)

## Setting levels: control = 사망, case = 생존

## Setting direction: controls < cases

result1

## 
## Call:
## roc.default(response = test$survived, predictor = rffit2_num)
## 
## Data: rffit2_num in 132 controls (test$survived 사망) < 184 cases (test$survived 생존).
## Area under the curve: 0.7745

bigdata_part04_10

kim kye chul

2022 5 28