# 작업형 기출2
# 보험 가입 확률을 묻는 문제
# index(id) 컬럼 포함 총 10개의 컬럼으로 되어있으며,
# train 데이터로 1491건, test 데이터로 496건의 자료를 제공
# INDEX 컬럼은 숫자형이라 그대로 두면 예측 변수로 모델에 포함되므로,
# 모델 학습에서는 INDEX 변수를 제외해야 함
# The workspace is deliberately not wiped here: rm(list = ls()) would delete
# the caller's objects while leaving attached packages and options in place,
# so it does not give a clean session. Run the script in a fresh R session
# instead.
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lattice
# 1. Inspect the data structure and split the data
df <- read.csv("travel_data.csv")
# Fix the RNG seed so the random partition below is the same on every run.
# NOTE: the row listings recorded further down came from an unseeded run, so
# the individual rows shown there may differ.
set.seed(123)
# train/test: 0.75:0.25
# list = FALSE returns the selected row numbers in a form that can be used
# directly for row subsetting.
train_list <- createDataPartition(
  y = df$TravelInsurance,
  p = 0.75,
  list = FALSE
)
df_train <- df[train_list, ]
df_test <- df[-train_list, ]
NROW(df_train)
## [1] 1491
NROW(df_test)
## [1] 496
df_train %>% glimpse
## Rows: 1,491
## Columns: 10
## $ INDEX <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, ~
## $ Age <int> 31, 31, 34, 28, 28, 25, 31, 31, 28, 33, 31, 26, 31~
## $ Employment.Type <chr> "Government Sector", "Private Sector/Self Employed~
## $ GraduateOrNot <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Y~
## $ AnnualIncome <int> 400000, 1250000, 500000, 700000, 700000, 1150000, ~
## $ FamilyMembers <int> 6, 7, 4, 3, 8, 4, 4, 3, 6, 3, 9, 5, 6, 3, 4, 2, 6,~
## $ ChronicDiseases <int> 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,~
## $ FrequentFlyer <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", ~
## $ EverTravelledAbroad <chr> "No", "No", "No", "No", "No", "No", "No", "Yes", "~
## $ TravelInsurance <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,~
# Show the column types and first values of the held-out split.
glimpse(df_test)
## Rows: 496
## Columns: 10
## $ INDEX <int> 12, 15, 17, 18, 21, 26, 34, 36, 37, 48, 51, 53, 54~
## $ Age <int> 32, 34, 28, 29, 29, 34, 28, 31, 34, 28, 29, 28, 29~
## $ Employment.Type <chr> "Government Sector", "Private Sector/Self Employed~
## $ GraduateOrNot <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "N~
## $ AnnualIncome <int> 850000, 700000, 800000, 1050000, 350000, 1300000, ~
## $ FamilyMembers <int> 6, 7, 7, 5, 3, 6, 9, 9, 4, 3, 3, 2, 7, 6, 4, 5, 4,~
## $ ChronicDiseases <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,~
## $ FrequentFlyer <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "~
## $ EverTravelledAbroad <chr> "No", "No", "No", "No", "No", "No", "No", "No", "N~
## $ TravelInsurance <int> 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,~
# In the real exam INDEX is just a meaningless number.
# Tag every row with the split it belongs to, then stack both splits into a
# single table.
df_train <- mutate(df_train, index = "train")
df_test <- mutate(df_test, index = "test")
full <- bind_rows(df_train, df_test)
head(full)
## INDEX Age Employment.Type GraduateOrNot AnnualIncome
## 1 0 31 Government Sector Yes 400000
## 2 1 31 Private Sector/Self Employed Yes 1250000
## 3 2 34 Private Sector/Self Employed Yes 500000
## 4 3 28 Private Sector/Self Employed Yes 700000
## 5 4 28 Private Sector/Self Employed Yes 700000
## 6 5 25 Private Sector/Self Employed No 1150000
## FamilyMembers ChronicDiseases FrequentFlyer EverTravelledAbroad
## 1 6 1 No No
## 2 7 0 No No
## 3 4 1 No No
## 4 3 1 No No
## 5 8 1 Yes No
## 6 4 0 No No
## TravelInsurance index
## 1 0 train
## 2 0 train
## 3 1 train
## 4 0 train
## 5 0 train
## 6 0 train
# 2. Convert the target and the other categorical variables
# The 0/1 target becomes a two-level factor labelled "미가입" (0) and
# "가입" (anything else); the three Yes/No columns become factors as well.
full <- full %>%
  mutate(
    TravelInsurance = as.factor(
      ifelse(TravelInsurance == 0, "미가입", "가입")
    ),
    across(c(GraduateOrNot, FrequentFlyer, EverTravelledAbroad), as.factor)
  )
# 3. Check for missing values (number of NA entries per column)
full %>%
  is.na() %>%
  colSums()
## INDEX Age Employment.Type GraduateOrNot
## 0 0 0 0
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## 0 0 0 0
## EverTravelledAbroad TravelInsurance index
## 0 0 0
# 4. Preprocessing and model training
# Yeo-Johnson transform, then centre and scale the three numeric predictors.
# NOTE(review): prep() estimates these parameters from `full`, i.e. from the
# training and test rows together.
data <- recipe(TravelInsurance ~ ., data = full) %>%
  step_YeoJohnson(Age, AnnualIncome, FamilyMembers) %>%
  step_center(Age, AnnualIncome, FamilyMembers) %>%
  step_scale(Age, AnnualIncome, FamilyMembers) %>%
  prep() %>%
  juice()
# Split back by the `index` tag. INDEX is only a row identifier, so it is
# dropped from the training table to keep it out of the `~ .` formula below.
# `test` keeps INDEX; prediction only reads the columns named in the fitted
# model's terms.
train <- data %>%
  filter(index == "train") %>%
  select(-index, -INDEX)
test <- data %>%
  filter(index == "test") %>%
  select(-index)
# Cross-validation scored with twoClassSummary (ROC / Sens / Spec);
# classProbs = TRUE is required so that ROC can be computed.
ctrl <- trainControl(
  method = "cv",
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
# Seed the RNG so the cross-validation folds are the same on every run.
set.seed(123)
# Fit a CART model, choosing cp by the largest cross-validated ROC.
rpfit <- train(
  TravelInsurance ~ .,
  data = train,
  method = "rpart",
  metric = "ROC",
  trControl = ctrl
)
# NOTE: the console output recorded below was produced while INDEX was still
# a predictor and without a seed, so its figures may differ from a fresh run.
rpfit
## CART
##
## 1491 samples
## 9 predictor
## 2 classes: '가입', '미가입'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1342, 1342, 1342, 1342, 1342, 1341, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.003838772 0.7939687 0.5951016 0.9628866
## 0.062380038 0.7460790 0.5200653 0.9659794
## 0.414587332 0.5558684 0.1230769 0.9886598
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.003838772.
# cp is rpart's complexity parameter; the selected value determines how many
# splits the tree keeps.
# Show the confusion matrix averaged over the cross-validation resamples.
rpfit %>% confusionMatrix()
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 가입 미가입
## 가입 20.8 2.4
## 미가입 14.2 62.6
##
## Accuracy (average) : 0.8343
# Show the preprocessed test table.
glimpse(test)
## Rows: 496
## Columns: 10
## $ INDEX <int> 12, 15, 17, 18, 21, 26, 34, 36, 37, 48, 51, 53, 54~
## $ Age <dbl> 0.8407842, 1.4182229, -0.5150177, -0.1471917, -0.1~
## $ Employment.Type <fct> Government Sector, Private Sector/Self Employed, P~
## $ GraduateOrNot <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, No, Yes, Yes, Ye~
## $ AnnualIncome <dbl> -0.1528386, -0.5655285, -0.2877223, 0.3642791, -1.~
## $ FamilyMembers <dbl> 0.8341005, 1.3278276, 1.3278276, 0.2737135, -1.152~
## $ ChronicDiseases <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,~
## $ FrequentFlyer <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, N~
## $ EverTravelledAbroad <fct> No, No, No, No, No, No, No, No, No, No, No, No, No~
## $ TravelInsurance <fct> 가입, 미가입, 가입, 가입, 가입, 가입, 가입, 미가입~
# Score the test set twice: class probabilities and hard class labels.
rffit1 <- predict(rpfit, newdata = test, type = "prob")
rffit2 <- predict(rpfit, newdata = test, type = "raw")
# Compare the predicted labels with the true labels of the test set.
confusionMatrix(data = rffit2, reference = test$TravelInsurance)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 가입 미가입
## 가입 103 9
## 미가입 86 298
##
## Accuracy : 0.8085
## 95% CI : (0.771, 0.8422)
## No Information Rate : 0.619
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5595
##
## Mcnemar's Test P-Value : 6.318e-15
##
## Sensitivity : 0.5450
## Specificity : 0.9707
## Pos Pred Value : 0.9196
## Neg Pred Value : 0.7760
## Prevalence : 0.3810
## Detection Rate : 0.2077
## Detection Prevalence : 0.2258
## Balanced Accuracy : 0.7578
##
## 'Positive' Class : 가입
##
# Unscaled variable importance of the fitted tree.
importance <- rpfit %>% varImp(scale = FALSE)
print(importance)
## rpart variable importance
##
## Overall
## AnnualIncome 211.2755
## EverTravelledAbroadYes 131.1675
## FamilyMembers 65.3992
## Age 49.5210
## FrequentFlyerYes 36.3936
## Employment.TypePrivate Sector/Self Employed 18.5065
## INDEX 2.5086
## GraduateOrNotYes 0.5929
## ChronicDiseases 0.0000
## `Employment.TypePrivate Sector/Self Employed` 0.0000
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Integer codes of the predicted class labels. They take only two values, so
# they are kept available but are not used as the ROC score.
rffit2_num <- as.numeric(rffit2)
# Build the ROC curve from the predicted probability of "가입" rather than
# from hard labels: a two-valued score yields a single operating point and
# an AUC equal to the balanced accuracy.
# levels = c(control, case); direction = "<" states that cases ("가입") are
# expected to have the higher score.
# NOTE: the output recorded below was produced from the label-based ROC.
result <- roc(
  response = test$TravelInsurance,
  predictor = rffit1[["가입"]],
  levels = c("미가입", "가입"),
  direction = "<"
)
## Setting levels: control = 가입, case = 미가입
## Setting direction: controls < cases
# Print the ROC object (call, control/case counts and AUC).
result
##
## Call:
## roc.default(response = test$TravelInsurance, predictor = rffit2_num)
##
## Data: rffit2_num in 189 controls (test$TravelInsurance 가입) < 307 cases (test$TravelInsurance 미가입).
## Area under the curve: 0.7578
# Extract only the area under the curve from the ROC object.
result[["auc"]]
## Area under the curve: 0.7578