bigdata_publish_rmarkdown_part04

library(dplyr)

## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

read.delim("titanic3.txt",header=TRUE,sep=",")->full
full %>% glimpse

## Rows: 1,309
## Columns: 14
## $ pclass    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ survived  <int> 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, ~
## $ name      <chr> "Allen, Miss. Elisabeth Walton", "Allison, Master. Hudson Tr~
## $ sex       <chr> "female", "male", "female", "male", "female", "male", "femal~
## $ age       <dbl> 29.00, 0.92, 2.00, 30.00, 25.00, 48.00, 63.00, 39.00, 53.00,~
## $ sibsp     <int> 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ parch     <int> 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ~
## $ ticket    <chr> "24160", "113781", "113781", "113781", "113781", "19952", "1~
## $ fare      <dbl> 211.3375, 151.5500, 151.5500, 151.5500, 151.5500, 26.5500, 7~
## $ cabin     <chr> "B5", "C22 C26", "C22 C26", "C22 C26", "C22 C26", "E12", "D7~
## $ embarked  <chr> "S", "S", "S", "S", "S", "S", "S", "S", "S", "C", "C", "C", ~
## $ boat      <chr> "2", "11", "", "", "", "3", "10", "", "D", "", "", "4", "9",~
## $ body      <int> NA, NA, NA, 135, NA, NA, NA, NA, NA, 22, 124, NA, NA, NA, NA~
## $ home.dest <chr> "St Louis, MO", "Montreal, PQ / Chesterville, ON", "Montreal~

library(caret)

## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: ggplot2

## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: lattice

train_list<-createDataPartition(y=full$survived,p=0.7,list=FALSE)
full_train<-full[train_list,]
full_test<-full[-train_list,]
NROW(full_train)

## [1] 917

NROW(full_test)

## [1] 392

train<-full_train
test<-full_test
library(dplyr)
train %>% mutate(index='train')->train
test %>% mutate(index='test')->test
bind_rows(train,test)->full
full %>% select(-boat,-body,-home.dest)->full
full %>% glimpse

## Rows: 1,309
## Columns: 12
## $ pclass   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ survived <int> 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1~
## $ name     <chr> "Allen, Miss. Elisabeth Walton", "Allison, Miss. Helen Lorain~
## $ sex      <chr> "female", "female", "male", "female", "male", "female", "male~
## $ age      <dbl> 29, 2, 30, 25, 39, 53, 71, 47, 18, 24, 80, 24, 50, 32, 36, 37~
## $ sibsp    <int> 0, 1, 1, 1, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1~
## $ parch    <int> 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0~
## $ ticket   <chr> "24160", "113781", "113781", "113781", "112050", "11769", "PC~
## $ fare     <dbl> 211.3375, 151.5500, 151.5500, 151.5500, 0.0000, 51.4792, 49.5~
## $ cabin    <chr> "B5", "C22 C26", "C22 C26", "C22 C26", "A36", "C101", "", "C6~
## $ embarked <chr> "S", "S", "S", "S", "S", "S", "C", "C", "C", "C", "S", "C", "~
## $ index    <chr> "train", "train", "train", "train", "train", "train", "train"~

full$survived<-as.factor(full$survived)
full$survived<-ifelse(full$survived==0,"생존","사망")
full$pclass<-as.factor(full$pclass)
full$sex<-as.factor(full$sex)
full$embarked<-as.factor(full$embarked)
summary(full)

##  pclass    survived             name               sex           age       
##  1:323   Length:1309        Length:1309        female:466   Min.   : 0.17  
##  2:277   Class :character   Class :character   male  :843   1st Qu.:21.00  
##  3:709   Mode  :character   Mode  :character                Median :28.00  
##                                                             Mean   :29.88  
##                                                             3rd Qu.:39.00  
##                                                             Max.   :80.00  
##                                                             NA's   :263    
##      sibsp            parch          ticket               fare        
##  Min.   :0.0000   Min.   :0.000   Length:1309        Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   Class :character   1st Qu.:  7.896  
##  Median :0.0000   Median :0.000   Mode  :character   Median : 14.454  
##  Mean   :0.4989   Mean   :0.385                      Mean   : 33.295  
##  3rd Qu.:1.0000   3rd Qu.:0.000                      3rd Qu.: 31.275  
##  Max.   :8.0000   Max.   :9.000                      Max.   :512.329  
##                                                      NA's   :1        
##     cabin           embarked    index          
##  Length:1309         :  2    Length:1309       
##  Class :character   C:270    Class :character  
##  Mode  :character   Q:123    Mode  :character  
##                     S:914                      
##                                                
##                                                
##

str(full)

## 'data.frame':    1309 obs. of  12 variables:
##  $ pclass  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ survived: chr  "사망" "생존" "생존" "생존" ...
##  $ name    : chr  "Allen, Miss. Elisabeth Walton" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" "Allison, Mrs. Hudson J C (Bessie Waldo Daniels)" ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 1 2 1 2 1 2 2 1 1 ...
##  $ age     : num  29 2 30 25 39 53 71 47 18 24 ...
##  $ sibsp   : int  0 1 1 1 0 2 0 1 1 0 ...
##  $ parch   : int  0 2 2 2 0 0 0 0 0 0 ...
##  $ ticket  : chr  "24160" "113781" "113781" "113781" ...
##  $ fare    : num  211 152 152 152 0 ...
##  $ cabin   : chr  "B5" "C22 C26" "C22 C26" "C22 C26" ...
##  $ embarked: Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 4 2 2 2 2 ...
##  $ index   : chr  "train" "train" "train" "train" ...

levels(full$embarked)[1]<-NA
table(full$embarked,useNA="always")

## 
##    C    Q    S <NA> 
##  270  123  914    2

str(full)

## 'data.frame':    1309 obs. of  12 variables:
##  $ pclass  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ survived: chr  "사망" "생존" "생존" "생존" ...
##  $ name    : chr  "Allen, Miss. Elisabeth Walton" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" "Allison, Mrs. Hudson J C (Bessie Waldo Daniels)" ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 1 2 1 2 1 2 2 1 1 ...
##  $ age     : num  29 2 30 25 39 53 71 47 18 24 ...
##  $ sibsp   : int  0 1 1 1 0 2 0 1 1 0 ...
##  $ parch   : int  0 2 2 2 0 0 0 0 0 0 ...
##  $ ticket  : chr  "24160" "113781" "113781" "113781" ...
##  $ fare    : num  211 152 152 152 0 ...
##  $ cabin   : chr  "B5" "C22 C26" "C22 C26" "C22 C26" ...
##  $ embarked: Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 1 1 1 1 ...
##  $ index   : chr  "train" "train" "train" "train" ...

full %>% filter(!is.na(age)&!is.na(fare)&!is.na(embarked))->full
colSums(is.na(full))

##   pclass survived     name      sex      age    sibsp    parch   ticket 
##        0        0        0        0        0        0        0        0 
##     fare    cabin embarked    index 
##        0        0        0        0

library(recipes)

## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

recipe(survived~.,data=full) %>% step_YeoJohnson(age,sibsp,parch,fare) %>% 
  step_center(age,sibsp,parch,fare) %>% 
  step_scale(age,sibsp,parch,fare) %>% 
  prep() %>% juice()->data

full %>% filter(index=="train") %>% select(-index,-name,-ticket,-cabin)->train
full %>% filter(index=='test') %>% select(-index,-name,-ticket,-cabin)->test

ctrl<-trainControl(method="cv",summaryFunction = twoClassSummary,
                   classProbs = TRUE)
train(survived~.,data=train,
      method="rpart",metric='ROC',
      trControl=ctrl)->rffit

rffit

## CART 
## 
## 720 samples
##   7 predictor
##   2 classes: '사망', '생존' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 647, 648, 648, 648, 649, 649, ... 
## Resampling results across tuning parameters:
## 
##   cp          ROC        Sens       Spec     
##   0.01797386  0.7923234  0.6867742  0.8524971
##   0.02287582  0.7751240  0.6767742  0.8476771
##   0.48692810  0.5863147  0.2451613  0.9274681
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01797386.

predict(rffit,test,type="prob")->rffit1
predict(rffit,test,type="raw")->rffit2
head(rffit2)

## [1] 사망 생존 사망 사망 생존 사망
## Levels: 사망 생존

levels(rffit2)

## [1] "사망" "생존"

levels(test$survived)

## NULL

str(test$survived)

##  chr [1:323] "사망" "사망" "사망" "사망" "사망" "사망" "사망" "사망" "생존" ...

test$survived<-as.factor(test$survived)
confusionMatrix(rffit2,test$survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 사망 생존
##       사망   89   38
##       생존   30  166
##                                           
##                Accuracy : 0.7895          
##                  95% CI : (0.7409, 0.8326)
##     No Information Rate : 0.6316          
##     P-Value [Acc > NIR] : 6.435e-10       
##                                           
##                   Kappa : 0.5539          
##                                           
##  Mcnemar's Test P-Value : 0.396           
##                                           
##             Sensitivity : 0.7479          
##             Specificity : 0.8137          
##          Pos Pred Value : 0.7008          
##          Neg Pred Value : 0.8469          
##              Prevalence : 0.3684          
##          Detection Rate : 0.2755          
##    Detection Prevalence : 0.3932          
##       Balanced Accuracy : 0.7808          
##                                           
##        'Positive' Class : 사망            
##

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 다음의 패키지를 부착합니다: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

rffit2_num<-as.numeric(rffit2)
rffit2_num

##   [1] 1 2 1 1 2 1 1 1 2 1 2 2 1 2 1 2 1 2 2 1 2 2 1 1 1 2 1 2 1 2 1 1 2 1 2 1 1
##  [38] 1 1 2 2 2 2 1 2 2 1 2 2 1 1 2 2 2 1 1 2 2 2 2 1 1 2 2 1 1 2 2 2 1 2 1 2 1
##  [75] 1 2 2 1 1 2 2 2 2 2 1 1 2 2 1 2 1 1 1 2 1 1 2 1 2 2 1 2 1 2 2 2 2 1 2 2 2
## [112] 1 1 2 2 2 2 1 2 2 1 1 1 2 2 1 2 2 2 2 2 1 1 2 2 1 2 2 1 1 2 1 1 2 1 1 2 1
## [149] 2 2 2 1 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 1 1 1 1 1 2 1 2 2 2 2 2 1 2 1 2 1 1
## [186] 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 1 2 2 2 2 2 2 2 2 1 1 2 2 1 2 1 2 2 2 2 2
## [223] 2 2 2 2 1 1 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1 1 1 1 1 2 2 2 1 1 2 1 1 2 2 2
## [260] 1 2 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 1 2 1 2 2 2 2 1 1 1 2 2 2 1 1 2 2 2 2 2
## [297] 2 1 2 2 1 2 2 2 1 2 2 2 2 1 1 2 2 2 1 2 2 1 2 2 2 1 1

result1<-pROC::roc(test$survived,rffit2_num)

## Setting levels: control = 사망, case = 생존

## Setting direction: controls < cases

result1

## 
## Call:
## roc.default(response = test$survived, predictor = rffit2_num)
## 
## Data: rffit2_num in 119 controls (test$survived 사망) < 204 cases (test$survived 생존).
## Area under the curve: 0.7808

bigdata_publish_rmarkdown_part04_02

kim kye chul

2022 5 20