# Attach dplyr and read the three raw input files.
#
# The script deliberately does NOT call rm(list = ls()) or setwd(): both
# silently change the state of whoever runs or sources this file. Input files
# are located through `data_dir` instead; any file written later on should be
# addressed with file.path(data_dir, ...) as well, because the working
# directory is no longer switched.
library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Folder that holds X_test.csv, X_train.csv and y_train.csv.
data_dir <- "c:/R"

x_test <- read.csv(file.path(data_dir, "X_test.csv"))
x_train <- read.csv(file.path(data_dir, "X_train.csv"))
y_train <- read.csv(file.path(data_dir, "y_train.csv"))

# Column-wise overview of the training features.
glimpse(x_train)
## Rows: 3,500
## Columns: 10
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산~
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
# Column-wise overview of the training labels.
glimpse(y_train)
## Rows: 3,500
## Columns: 2
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ gender  <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,~
# Column-wise overview of the test features.
glimpse(x_test)
## Rows: 2,482
## Columns: 10
## $ cust_id        <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3~
## $ 총구매액       <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000~
## $ 최대구매액     <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000~
## $ 환불금액       <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, ~
## $ 주구매상품     <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화~
## $ 주구매지점     <chr> "부산본점", "잠실점", "본  점", "부산본점", "청량리점",~
## $ 내점일수       <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, ~
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20~
## $ 주말방문비율   <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500~
## $ 구매주기       <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2~
# Attach the label to each training row by customer id and tag the rows so
# they can be told apart once train and test are stacked.
train <- x_train %>%
  left_join(y_train, by = "cust_id") %>%
  mutate(index = "train")
glimpse(train)
## Rows: 3,500
## Columns: 12
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산~
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
## $ gender         <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0~
## $ index          <chr> "train", "train", "train", "train", "train", "train", "~
# Tag the test rows with their split marker.
test <- mutate(x_test, index = "test")
glimpse(test)
## Rows: 2,482
## Columns: 11
## $ cust_id        <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3~
## $ 총구매액       <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000~
## $ 최대구매액     <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000~
## $ 환불금액       <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, ~
## $ 주구매상품     <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화~
## $ 주구매지점     <chr> "부산본점", "잠실점", "본  점", "부산본점", "청량리점",~
## $ 내점일수       <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, ~
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20~
## $ 주말방문비율   <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500~
## $ 구매주기       <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2~
## $ index          <chr> "test", "test", "test", "test", "test", "test", "test",~
# Stack train on top of test; test rows get NA for the gender column they
# do not have.
full <- bind_rows(train, test)
glimpse(full)
## Rows: 5,982
## Columns: 12
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산~
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
## $ gender         <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0~
## $ index          <chr> "train", "train", "train", "train", "train", "train", "~
# Recode the 0/1 target into a labelled factor in a single step
# (0 -> "여성", 1 -> "남성"). The previous as.factor() call was dead code: its
# result was immediately overwritten by a character vector from ifelse().
# Levels are listed explicitly, "남성" first, so the class order the model
# sees no longer depends on implicit string-to-factor conversion later on.
# Test rows have no label and stay NA.
full$gender <- factor(full$gender, levels = c(1, 0), labels = c("남성", "여성"))

# Missing values per column.
colSums(is.na(full))
##        cust_id       총구매액     최대구매액       환불금액     주구매상품 
##              0              0              0           3906              0 
##     주구매지점       내점일수 내점당구매건수   주말방문비율       구매주기 
##              0              0              0              0              0 
##         gender          index 
##           2482              0
# A missing refund amount is treated as zero refund.
full$환불금액 <- replace(full$환불금액, is.na(full$환불금액), 0)
colnames(full)
##  [1] "cust_id"        "총구매액"       "최대구매액"     "환불금액"      
##  [5] "주구매상품"     "주구매지점"     "내점일수"       "내점당구매건수"
##  [9] "주말방문비율"   "구매주기"       "gender"         "index"
# Give the Korean feature columns short English names and move the
# id / target / split columns to the front. select() renames and reorders
# in one call.
data <- full %>%
  select(
    cust_id, gender, index,
    total = "총구매액",
    max = "최대구매액",
    refund = "환불금액",
    product = "주구매상품",
    store = "주구매지점",
    day = "내점일수",
    count = "내점당구매건수",
    week = "주말방문비율",
    cycle = "구매주기"
  )
glimpse(data)
## Rows: 5,982
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ gender  <chr> "여성", "여성", "남성", "남성", "여성", "여성", "여성", "여성"~
## $ index   <chr> "train", "train", "train", "train", "train", "train", "train",~
## $ total   <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 11379000, 1005~
## $ max     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000, 761200~
## $ refund  <dbl> 6860000, 300000, 0, 0, 0, 462000, 4582000, 29524000, 0, 0, 224~
## $ product <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자이너", "~
## $ store   <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산점", "~
## $ day     <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152, 26, 2~
## $ count   <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666667, 2.~
## $ week    <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000000, 0.~
## $ cycle   <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, 2, 13, ~
# Missing values per column after renaming.
data %>%
  is.na() %>%
  colSums()
## cust_id  gender   index   total     max  refund product   store     day   count 
##       0    2482       0       0       0       0       0       0       0       0 
##    week   cycle 
##       0       0
library(recipes)
## 
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
# Yeo-Johnson transform, then center and scale the seven numeric features.
# NOTE(review): the recipe is estimated on `data`, which contains the train
# and the test rows together, so the transformation parameters also reflect
# the test set - confirm this is intended.
prepped <- recipe(gender ~ ., data = data) %>%
  step_YeoJohnson(total, refund, max, day, count, cycle, week) %>%
  step_center(total, refund, max, day, count, cycle, week) %>%
  step_scale(total, refund, max, day, count, cycle, week) %>%
  prep()

# Processed version of the full table.
data1 <- juice(prepped)

# Split back into the two original partitions using the marker column.
train <- filter(data1, index == "train")
test <- filter(data1, index == "test")

library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
# Cross-validated resampling that reports ROC / Sens / Spec; class
# probabilities are required for the ROC metric.
ctrl <- trainControl(
  method = "cv",
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# cust_id is a row identifier and index is the train/test marker (constant
# within the training rows), so neither may be offered to the tree as a
# predictor. They are dropped from the modelling frame only; `train` and
# `test` keep both columns.
model_data <- dplyr::select(train, -cust_id, -index)

# Fix the RNG state so the resampling folds, and therefore the selected cp,
# are reproducible between runs.
set.seed(1234)
rpartfit <- train(
  gender ~ .,
  data = model_data,
  method = "rpart",
  metric = "ROC",
  trControl = ctrl
)
rpartfit
## CART 
## 
## 3500 samples
##   11 predictor
##    2 classes: '남성', '여성' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 3149, 3150, 3151, 3150, 3150, 3150, ... 
## Resampling results across tuning parameters:
## 
##   cp           ROC        Sens       Spec     
##   0.005319149  0.6273140  0.3706859  0.7903230
##   0.006838906  0.6037011  0.3195061  0.8100163
##   0.007598784  0.6037011  0.3195061  0.8100163
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005319149.
# Score the test partition: class probabilities and hard class labels.
pred_fit <- predict(rpartfit, test, type = "prob")
pred_fit1 <- predict(rpartfit, test, type = "raw")

head(pred_fit)
##        남성      여성
## 1 0.2635710 0.7364290
## 2 0.2635710 0.7364290
## 3 0.2635710 0.7364290
## 4 0.5528302 0.4471698
## 5 0.5528302 0.4471698
## 6 0.3072289 0.6927711
head(pred_fit1)
## [1] 여성 여성 여성 남성 남성 여성
## Levels: 남성 여성