BYEON, Inseop
2023-11-28
x <- c("tidyverse", "ggplot2","lubridate", "DescTools")
lapply(x,library,character.only=TRUE)
task1 <- read.delim("유지율 예측용 기초자료_1.txt")
task2 <- read.delim("유지율 예측용 기초자료_2.txt",fileEncoding="euc-kr")
task3 <- read.delim("유지율 예측용 기초자료_3.txt",fileEncoding="euc-kr")
task4 <- read.delim("유지율 예측용 기초자료_4.txt",fileEncoding="euc-kr")
task <- rbind(task1,task2,task3,task4)
task <- subset(task,nchar(계약자주민번호)!=6 & 계약일자>=20200101)
colnames(task,5)
task <- subset(task,nchar(계약자주민번호)!=6 & 계약일자>=20180101)
task$계약일자 = as.character(task$계약일자) %>% as.Date(format="%Y%m%d")
task <- subset(task,계약일자 <= max(task$계약일자) %m-% months(13))
task$최종입금일자 = as.character(task$최종입금일자) %>% as.Date(format="%Y%m%d")
task$소멸일자 = as.character(task$소멸일자) %>% as.Date(format="%Y%m%d")
task$소멸일자[is.na(task$소멸일자)] <- '9999-12-31'
task$해촉일 = as.character(task$해촉일) %>% as.Date(format="%Y%m%d")
task$해촉일[is.na(task$해촉일)] <- '9999-12-31'
task$계약자성별 <- ifelse(nchar(task$계약자주민번호)==10, "N" ,ifelse(as.numeric(substr(task$계약자주민번호,7,7))%%2==0, "F", "M")) # M:남성/F:여성/N:법인
task$해촉영향 <- with(task,ifelse(해촉일<=소멸일자, 1, 0))
task$유지over13 <- with(task, ifelse(납입주기=="월납" & 최종납입회차>=13, 1,
ifelse(납입주기 =="3개월납" & 최종납입회차 >=5, 1,
ifelse(납입주기 == "6개월납" & 최종납입회차 >=3, 1,
ifelse(납입주기 == "연납" & 최종납입회차 >=2, 1,
ifelse(납입주기 == "일시납" & 소멸일자 >= (계약일자 %m+% months(13)), 1, 0))))) )
task$유지over25 <- with(task, ifelse(납입주기=="월납" & 최종납입회차>=25, 1,
ifelse(납입주기 =="3개월납" & 최종납입회차 >=9, 1,
ifelse(납입주기 == "6개월납" & 최종납입회차 >=5, 1,
ifelse(납입주기 == "연납" & 최종납입회차 >=3, 1,
ifelse(납입주기 == "일시납" & 소멸일자 >= (계약일자 %m+% months(25)), 1, 0))))) )
task %>% with(Desc(년기준))
# Through trials and errors, I knew that it's required to transform yet again as below
# scorecard::woebin(input, y="churn13", x=c("period"),positive=1, method="tree",count_distr_limit = 0.05,bin_num_limit = 6, save_breaks_list = "input_bin_count")
task <- task %>% mutate(년기준_new = ifelse(년기준 < 1, 1,
ifelse(년기준 < 10, 2,
ifelse(년기준 < 16, 3, 4))))
task$약관대출잔액[is.na(task$약관대출잔액)] <- 0
task %>% with(Desc(약관대출잔액))
# Through trials and errors, I knew that it's required to transform yet again as below
# scorecard::woebin(input, y="churn13", x=c("loan_residual"),positive=1, method="tree",count_distr_limit = 0.05,bin_num_limit = 6, save_breaks_list = "input_bin_count")
task$약관대출잔액_new <- ifelse(task$약관대출잔액>0,1,0)
year1 <- substr(task$계약자주민번호, 1, 2)
month <- substr(task$계약자주민번호, 3, 4)
day <- substr(task$계약자주민번호, 5, 6)
year2 <- ifelse(as.numeric(year1) >= 0 & as.numeric(year1)<= 23, paste0("20", year1), paste0("19", year1))
age <- year(task$계약일자) - as.numeric(year2) + ifelse(month(task$계약일자) < as.numeric(month) |
(month(task$계약일자) == as.numeric(month) & day(task$계약일자) < as.numeric(day)), -1, 0)
mean_age <- round(mean(age,na.rm=TRUE),0)
task$계약자나이 <- ifelse(nchar(task$계약자주민번호)==10,mean_age,age)
task %>% with(Desc(계약자나이))
# Through trials and errors, I knew that it's required to transform yet again as below
# scorecard::woebin(input, y="churn13", x=c("ctrt_age"),positive=1, method="tree",count_distr_limit = 0.05,bin_num_limit = 6, save_breaks_list = "input_bin_count")
task <- task %>% mutate(계약자나이_new = ifelse(계약자나이 < 32, 1,
ifelse(계약자나이 < 52, 2,
ifelse(계약자나이 < 56, 3, 4))))
pay_cycle <- c("월납","3개월납","6개월납","연납","일시납")
pay_cycle_as_num <- c(1,3,6,12,50)
(pay_cycle_table1 <- data.frame(pay_cycle,pay_cycle_as_num))
temp <- as.data.frame(task$납입주기)
colnames(temp) <- "pay_cycle"
pay_cycle_table2 <- left_join(temp,pay_cycle_table1);pay_cycle_table2
task$변환보험료 <- unlist(task$최종합계보험료/pay_cycle_table2[2])
task %>% with(Desc(변환보험료))
# Through trials and errors, I knew that it's required to transform yet again as below
# scorecard::woebin(input, y="churn13", x=c("premium_trans"),positive=1, method="tree",count_distr_limit = 0.05,bin_num_limit = 6, save_breaks_list = "input_bin_count")
task <- task %>% mutate(변환보험료_new = ifelse(변환보험료 < 120000, 1,
ifelse(변환보험료 < 200000, 2,
ifelse(변환보험료 < 220000, 3,
ifelse(변환보험료 < 460000, 4,
ifelse(변환보험료 < 600000, 5, 6))))))
task$합계보험료[is.na(task$합계보험료)] <- 0
task$감액 <- with(task, ifelse(합계보험료>최종합계보험료, unlist((합계보험료-최종합계보험료)/pay_cycle_table2[2]), 0))
task %>% with(Desc(감액))
# Through trials and errors, I knew that it's required to transform yet again as below
# scorecard::woebin(input, y="churn13", x=c("resize"),positive=1, method="tree",count_distr_limit = 0.05,bin_num_limit = 6, save_breaks_list = "input_bin_count")
task$감액_new <- with(task, ifelse(합계보험료>최종합계보험료, 1, 0))
input <- subset(task,select=c("모집채널",
"납입주기",
"년기준_new",
"생명보험협회상품종류",
"약관대출잔액_new",
"계약자성별",
"계약자나이_new",
"해촉영향",
"변환보험료_new",
"감액_new",
"유지over13",
"유지over25"))
input <- input %>% rename(channel="모집채널",
cycle="납입주기",
period="년기준_new",
sort="생명보험협회상품종류",
loan_residual="약관대출잔액_new",
ctrt_gender="계약자성별",
ctrt_age="계약자나이_new",
if_resign="해촉영향",
premium_trans="변환보험료_new",
resize="감액_new",
churn13="유지over13",
churn25="유지over25")
input$channel <- factor(input$channel)
input$cycle <- factor(input$cycle)
input$period <- factor(input$period)
input$sort <- factor(input$sort)
input$loan_residual <- factor(input$loan_residual)
input$ctrt_gender <- factor(input$ctrt_gender)
input$ctrt_age <- factor(input$ctrt_age)
input$if_resign <- factor(input$if_resign)
input$premium_trans <- factor(input$premium_trans)
input$resize <- factor(input$resize)
input$churn13 <- factor(input$churn13)
input$churn25 <- factor(input$churn25)
input %>% ggplot(aes(x=channel, fill=channel)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "모집채널", y="percent", fill ="모집채널")
input %>% ggplot(aes(x=cycle, fill=cycle)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "납입주기", y="percent", fill ="납입주기")
input %>% ggplot(aes(x=period, fill=period)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "납입기간", y="percent", fill ="납입기간")
input %>% ggplot(aes(x=sort, fill=sort)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "생명보험협회상품종류", y="percent", fill ="생명보험협회상품종류")
input %>% ggplot(aes(x=loan_residual, fill=loan_residual)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "약관대출잔액", y="percent", fill ="약관대출잔액")
input %>% ggplot(aes(x=ctrt_gender, fill=ctrt_gender)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "계약자성별", y="percent", fill ="계약자성별")
input %>% ggplot(aes(x=ctrt_age, fill=ctrt_age)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "계약자나이", y="percent", fill ="계약자나이")
input %>% ggplot(aes(x=if_resign, fill=if_resign)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "해촉영향", y="percent", fill ="해촉영향")
input %>% ggplot(aes(x=premium_trans, fill=premium_trans)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "변환보험료", y="percent", fill ="변환보험료")
input %>% ggplot(aes(x=resize, fill=resize)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "보험금액 감액", y="percent", fill ="보험금액 감액")
input %>% ggplot(aes(x=churn13, fill=churn13)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "13회차유지", y="percent", fill ="13회차유지")
input %>% ggplot(aes(x=churn25, fill=churn25)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
labs(x = "25회차유지", y="percent", fill ="25회차유지")
library(remotes)
install_github("tomasgreif/woe")
library(woe)
iv.all <- iv.mult(df=subset(input,select=-c(churn25)), y="churn13", summary=TRUE)
iv.plot.summary(iv.all)
set.seed(111)
index <- sample(x=c(TRUE,FALSE), size=NROW(input), replace=TRUE, prob=c(0.8,0.2))
train <- input[index,]
test <- input[!index,]
str(train)
model_logistic <- glm(churn13 ~ channel + cycle + period + sort + loan_residual + ctrt_gender + ctrt_age + if_resign + premium_trans + resize,
data=train, family=binomial(link='logit'))
step(model_logistic, direction="both")
library(ROCR)
library(caret)
test_prediction <- predict(model_logistic, newdata = test, type='response')
pred_test <- prediction(predictions = as.numeric(test_prediction),labels=as.numeric(test$churn13))
auc_test <- performance(prediction.obj = pred_test,measure='auc')
auc_value_test=unlist(slot(auc_test,'y.values'))
paste(c('AUC='),round(auc_value_test,6),sep="")
perf_test <- performance(pred_test,measure='tpr', x.measure='fpr')
plot(perf_test)
confusionMatrix(factor(ifelse(test_prediction>.5, 1, 0)), factor(as.numeric(test$churn13)-1))
library(rpart)
CARTmodel <- rpart(churn13 ~ channel + cycle + period + sort + loan_residual + ctrt_gender + ctrt_age + if_resign + premium_trans + resize,
data=train, control=rpart.control(minsplit=5))
test_prediction <- predict(CARTmodel, newdata = test, type='prob')
confusionMatrix(factor(ifelse(test_prediction[,2]>.5, 1, 0)), factor(as.numeric(test$churn13)-1))
library(rpart.plot)
rpart.plot::prp(CARTmodel, type=4, extra=2,digits=3)
library(adabag)
bagging_model <- bagging(churn13 ~ channel + cycle + period + sort + loan_residual + ctrt_gender + ctrt_age + if_resign + premium_trans + resize,
data=train, mfinal=100)
test_prediction <- predict(bagging_model, newdata = test)
confusionMatrix(factor(test_prediction$class), factor(as.numeric(test$churn13)-1))
library(adabag)
boosting_model <- boosting(churn13 ~ channel + cycle + period + sort + loan_residual + ctrt_gender + ctrt_age + if_resign + premium_trans + resize,
data=train, boos=TRUE, mfinal=30)
test_prediction <- predict(boosting_model, newdata = test)
confusionMatrix(factor(test_prediction$class), factor(as.numeric(test$churn13)-1))
library(randomForest)
forest_model <- randomForest(churn13 ~ channel + cycle + period + sort + loan_residual + ctrt_gender + ctrt_age + if_resign + premium_trans + resize,
data=train, ntree=100, mtry=5)
test_prediction <- predict(forest_model, newdata = test, type='prob')
confusionMatrix(factor(ifelse(test_prediction[,2]>.5, 1, 0)), factor(as.numeric(test$churn13)-1))
library(nnet)
nn_model <- nnet(churn13 ~ channel + cycle + period + sort + loan_residual + ctrt_gender + ctrt_age + if_resign + premium_trans + resize,
data=train, size=5, rang=0.5, decay=5e-4, maxit=1000)
test_prediction <- predict(nn_model, newdata = test, type='raw')
confusionMatrix(factor(ifelse(test_prediction>.5, 1, 0)), factor(as.numeric(test$churn13)-1))
| 모델명 | Accuracy | Sensitivity | Specificity |
|---|---|---|---|
| 로지스틱회귀 | 0.7721 | 0.6737 | 0.8518 |
| 의사결정나무 | 0.798 | 0.6746 | 0.8978 |
| 배깅 | 0.798 | 0.6746 | 0.8978 |
| 부스팅 | 0.7985 | 0.6764 | 0.8974 |
| 랜덤포레스트 | 0.803 | 0.6834 | 0.8998 |
| 인공신경망 | 0.8017 | 0.6826 | 0.8981 |
-끝-