rm(list=ls())
setwd("c:/R")
ls()
## character(0)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(recipes)
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
df<-read.csv("travel_data.csv")
df %>% glimpse
## Rows: 1,987
## Columns: 10
## $ INDEX <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ~
## $ Age <int> 31, 31, 34, 28, 28, 25, 31, 31, 28, 33, 31, 26, 32~
## $ Employment.Type <chr> "Government Sector", "Private Sector/Self Employed~
## $ GraduateOrNot <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Y~
## $ AnnualIncome <int> 400000, 1250000, 500000, 700000, 700000, 1150000, ~
## $ FamilyMembers <int> 6, 7, 4, 3, 8, 4, 4, 3, 6, 3, 9, 5, 6, 6, 3, 7, 4,~
## $ ChronicDiseases <int> 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,~
## $ FrequentFlyer <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", ~
## $ EverTravelledAbroad <chr> "No", "No", "No", "No", "No", "No", "No", "Yes", "~
## $ TravelInsurance <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,~
View(df)
set.seed(1357)
sample(1:46,6)
## [1] 12 9 26 7 4 44
train_list<-createDataPartition(y=df$TravelInsurance,p=0.75,list=FALSE)
class(train_list)
## [1] "matrix" "array"
head(train_list)
## Resample1
## [1,] 3
## [2,] 4
## [3,] 5
## [4,] 8
## [5,] 9
## [6,] 10
df_train<-df[train_list,]
df_test<-df[-train_list,]
NROW(df_train)
## [1] 1491
NROW(df_test)
## [1] 496
df_train %>% glimpse
## Rows: 1,491
## Columns: 10
## $ INDEX <int> 2, 3, 4, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, ~
## $ Age <int> 34, 28, 28, 31, 28, 33, 31, 26, 31, 28, 28, 29, 34~
## $ Employment.Type <chr> "Private Sector/Self Employed", "Private Sector/Se~
## $ GraduateOrNot <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "~
## $ AnnualIncome <int> 500000, 700000, 700000, 1350000, 1450000, 800000, ~
## $ FamilyMembers <int> 4, 3, 8, 3, 6, 3, 9, 5, 6, 4, 7, 5, 2, 6, 3, 4, 9,~
## $ ChronicDiseases <int> 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,~
## $ FrequentFlyer <chr> "No", "No", "Yes", "Yes", "Yes", "Yes", "No", "Yes~
## $ EverTravelledAbroad <chr> "No", "No", "No", "Yes", "Yes", "No", "No", "Yes",~
## $ TravelInsurance <int> 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,~
df_test %>% glimpse
## Rows: 496
## Columns: 10
## $ INDEX <int> 0, 1, 5, 6, 12, 14, 15, 27, 33, 37, 38, 39, 43, 46~
## $ Age <int> 31, 31, 25, 31, 32, 31, 34, 28, 32, 34, 34, 33, 28~
## $ Employment.Type <chr> "Government Sector", "Private Sector/Self Employed~
## $ GraduateOrNot <chr> "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Y~
## $ AnnualIncome <int> 400000, 1250000, 1150000, 1300000, 850000, 400000,~
## $ FamilyMembers <int> 6, 7, 4, 4, 6, 3, 7, 2, 3, 4, 2, 5, 4, 4, 3, 3, 9,~
## $ ChronicDiseases <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,~
## $ FrequentFlyer <chr> "No", "No", "No", "No", "No", "No", "No", "Yes", "~
## $ EverTravelledAbroad <chr> "No", "No", "No", "No", "No", "No", "No", "No", "N~
## $ TravelInsurance <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,~
df_train %>% mutate(index="train")->df_train
df_test %>% mutate(index="test")->df_test
bind_rows(df_train,df_test)->full
full %>% glimpse
## Rows: 1,987
## Columns: 11
## $ INDEX <int> 2, 3, 4, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, ~
## $ Age <int> 34, 28, 28, 31, 28, 33, 31, 26, 31, 28, 28, 29, 34~
## $ Employment.Type <chr> "Private Sector/Self Employed", "Private Sector/Se~
## $ GraduateOrNot <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "~
## $ AnnualIncome <int> 500000, 700000, 700000, 1350000, 1450000, 800000, ~
## $ FamilyMembers <int> 4, 3, 8, 3, 6, 3, 9, 5, 6, 4, 7, 5, 2, 6, 3, 4, 9,~
## $ ChronicDiseases <int> 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,~
## $ FrequentFlyer <chr> "No", "No", "Yes", "Yes", "Yes", "Yes", "No", "Yes~
## $ EverTravelledAbroad <chr> "No", "No", "No", "Yes", "Yes", "No", "No", "Yes",~
## $ TravelInsurance <int> 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,~
## $ index <chr> "train", "train", "train", "train", "train", "trai~
full$TravelInsurance<-ifelse(full$TravelInsurance==0,"미가입","가입")
full$GraduateOrNot<-as.factor(full$GraduateOrNot)
full$FrequentFlyer<-as.factor(full$FrequentFlyer)
full$EverTravelledAbroad<-as.factor(full$EverTravelledAbroad)
full %>% glimpse
## Rows: 1,987
## Columns: 11
## $ INDEX <int> 2, 3, 4, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, ~
## $ Age <int> 34, 28, 28, 31, 28, 33, 31, 26, 31, 28, 28, 29, 34~
## $ Employment.Type <chr> "Private Sector/Self Employed", "Private Sector/Se~
## $ GraduateOrNot <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, ~
## $ AnnualIncome <int> 500000, 700000, 700000, 1350000, 1450000, 800000, ~
## $ FamilyMembers <int> 4, 3, 8, 3, 6, 3, 9, 5, 6, 4, 7, 5, 2, 6, 3, 4, 9,~
## $ ChronicDiseases <int> 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,~
## $ FrequentFlyer <fct> No, No, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, ~
## $ EverTravelledAbroad <fct> No, No, No, Yes, Yes, No, No, Yes, Yes, No, No, No~
## $ TravelInsurance <chr> "가입", "미가입", "미가입", "가입", "가입", "미가~
## $ index <chr> "train", "train", "train", "train", "train", "trai~
colSums(is.na(full))
## INDEX Age Employment.Type GraduateOrNot
## 0 0 0 0
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## 0 0 0 0
## EverTravelledAbroad TravelInsurance index
## 0 0 0
summary(is.na(full))
## INDEX Age Employment.Type GraduateOrNot
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1987 FALSE:1987 FALSE:1987 FALSE:1987
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1987 FALSE:1987 FALSE:1987 FALSE:1987
## EverTravelledAbroad TravelInsurance index
## Mode :logical Mode :logical Mode :logical
## FALSE:1987 FALSE:1987 FALSE:1987
recipe(TravelInsurance~.,data=full) %>%
step_YeoJohnson(Age,AnnualIncome,FamilyMembers) %>%
step_center(Age,AnnualIncome,FamilyMembers) %>%
step_scale(Age,AnnualIncome,FamilyMembers) %>% prep() %>% juice()->data
data %>% glimpse
## Rows: 1,987
## Columns: 11
## $ INDEX <int> 2, 3, 4, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, ~
## $ Age <dbl> 1.4182229, -0.5150177, -0.5150177, 0.5290744, -0.5~
## $ Employment.Type <fct> Private Sector/Self Employed, Private Sector/Self ~
## $ GraduateOrNot <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, ~
## $ AnnualIncome <dbl> -1.16128743, -0.56552847, -0.56552847, 1.08672642,~
## $ FamilyMembers <dbl> -0.3760749, -1.1525825, 1.7698222, -1.1525825, 0.8~
## $ ChronicDiseases <int> 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,~
## $ FrequentFlyer <fct> No, No, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, ~
## $ EverTravelledAbroad <fct> No, No, No, Yes, Yes, No, No, Yes, Yes, No, No, No~
## $ index <fct> train, train, train, train, train, train, train, t~
## $ TravelInsurance <fct> 가입, 미가입, 미가입, 가입, 가입, 미가입, 미가입, ~
data %>% filter(index=="train") %>% select(-index)->train
data %>% filter(index=="test") %>% select(-index)->test
ctrl<-trainControl(method="cv",summaryFunction=twoClassSummary
,classProbs=TRUE)
train(TravelInsurance~.,data=train,
method='rf',metric="ROC",
trControl=ctrl)->rpfit
rpfit
## Random Forest
##
## 1491 samples
## 9 predictor
## 2 classes: '가입', '미가입'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1342, 1342, 1342, 1342, 1341, 1342, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7859345 0.5807692 0.9691563
## 5 0.7854082 0.6096154 0.9320745
## 9 0.7828484 0.5923077 0.9155902
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
confusionMatrix(rpfit)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 가입 미가입
## 가입 20.3 2.0
## 미가입 14.6 63.1
##
## Accuracy (average) : 0.8337
test %>% glimpse
## Rows: 496
## Columns: 10
## $ INDEX <int> 0, 1, 5, 6, 12, 14, 15, 27, 33, 37, 38, 39, 43, 46~
## $ Age <dbl> 0.5290744, 0.5290744, -1.7617299, 0.5290744, 0.840~
## $ Employment.Type <fct> Government Sector, Private Sector/Self Employed, P~
## $ GraduateOrNot <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y~
## $ AnnualIncome <dbl> -1.4862039, 0.8519524, 0.6113603, 0.9700295, -0.15~
## $ FamilyMembers <dbl> 0.8341005, 1.3278276, -0.3760749, -0.3760749, 0.83~
## $ ChronicDiseases <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,~
## $ FrequentFlyer <fct> No, No, No, No, No, No, No, Yes, No, No, No, Yes, ~
## $ EverTravelledAbroad <fct> No, No, No, No, No, No, No, No, No, No, No, Yes, N~
## $ TravelInsurance <fct> 미가입, 미가입, 미가입, 미가입, 가입, 미가입, 미가~
predict(rpfit,test,type='prob')->rffit1
predict(rpfit,test,type='raw')->rffit2
head(rffit1)
## 가입 미가입
## 1 0.174 0.826
## 2 0.356 0.644
## 3 0.308 0.692
## 4 0.232 0.768
## 5 0.366 0.634
## 6 0.210 0.790
head(rffit2)
## [1] 미가입 미가입 미가입 미가입 미가입 미가입
## Levels: 가입 미가입
confusionMatrix(rffit2,test$TravelInsurance)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 가입 미가입
## 가입 115 8
## 미가입 75 298
##
## Accuracy : 0.8327
## 95% CI : (0.7968, 0.8645)
## No Information Rate : 0.6169
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6206
##
## Mcnemar's Test P-Value : 4.342e-13
##
## Sensitivity : 0.6053
## Specificity : 0.9739
## Pos Pred Value : 0.9350
## Neg Pred Value : 0.7989
## Prevalence : 0.3831
## Detection Rate : 0.2319
## Detection Prevalence : 0.2480
## Balanced Accuracy : 0.7896
##
## 'Positive' Class : 가입
##
importance<-varImp(rpfit,scale=FALSE)
print(importance)
## rf variable importance
##
## Overall
## AnnualIncome 153.644
## INDEX 76.091
## Age 63.104
## EverTravelledAbroadYes 51.959
## FamilyMembers 51.206
## FrequentFlyerYes 16.677
## ChronicDiseases 10.322
## Employment.TypePrivate Sector/Self Employed 9.108
## GraduateOrNotYes 7.653
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
rffit2_num<-as.numeric(rffit2)
rffit2_num
## [1] 2 2 2 2 2 2 1 2 2 2 2 1 2 1 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 1 1 2
## [38] 2 2 1 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 1 2
## [75] 2 1 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1 2 1 2 2 2 1 2 2 2 2 1 1 1 2 2 2 2 2
## [112] 2 2 1 2 1 2 1 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 2
## [149] 2 2 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 1 2 2 2
## [186] 2 2 2 1 2 2 2 2 1 2 2 1 2 2 2 2 1 1 1 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2
## [223] 2 1 1 2 2 2 2 1 2 2 1 1 2 2 1 1 2 2 1 2 2 2 2 1 1 1 2 2 2 2 1 2 2 1 1 1 2
## [260] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 1 2 2 1 2 2
## [297] 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2
## [334] 1 1 2 2 2 2 2 2 1 2 2 1 2 2 1 2 2 2 2 1 2 2 2 2 2 1 2 1 1 2 1 2 1 2 1 2 2
## [371] 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 2 2 1 2
## [408] 2 1 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2 1 2 1 1 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2
## [445] 1 2 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1
## [482] 1 2 2 2 2 2 2 1 2 2 2 2 1 2 1
result<-roc(test$TravelInsurance,rffit2_num)
## Setting levels: control = 가입, case = 미가입
## Setting direction: controls < cases
result$auc
## Area under the curve: 0.7896