# Start from an empty global environment.
# NOTE(review): this removes every object in the caller's workspace.
rm(list = ls())

# Show the working directory the session started in.
getwd()
## [1] "C:/R"

# Switch to the folder that the relative CSV paths below are resolved against.
# NOTE(review): hard-coded, machine-specific path.
setwd("c:/R")

# List the workspace contents after the clean-up.
ls()
## character(0)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the three input tables from the working directory:
# test features, training features, and training labels.
x_test <- read.csv(file = "X_test.csv")
x_train <- read.csv(file = "X_train.csv")
y_train <- read.csv(file = "y_train.csv")

# Inspect the structure of the training features.
glimpse(x_train)
## Rows: 3,500
## Columns: 10
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액 <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액 <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액 <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품 <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점 <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산~
## $ 내점일수 <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율 <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기 <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
# Inspect the training labels (cust_id plus the 0/1 gender code).
glimpse(y_train)
## Rows: 3,500
## Columns: 2
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ gender <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,~
# Inspect the test features.
glimpse(x_test)
## Rows: 2,482
## Columns: 10
## $ cust_id <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3~
## $ 총구매액 <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000~
## $ 최대구매액 <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000~
## $ 환불금액 <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, ~
## $ 주구매상품 <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화~
## $ 주구매지점 <chr> "부산본점", "잠실점", "본 점", "부산본점", "청량리점",~
## $ 내점일수 <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, ~
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20~
## $ 주말방문비율 <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500~
## $ 구매주기 <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2~
# Attach the gender label to each training customer by cust_id and tag the
# rows so they can be recovered after train and test are stacked together.
train <- x_train %>%
  left_join(y_train, by = "cust_id") %>%
  mutate(index = "train")
glimpse(train)
## Rows: 3,500
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액 <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액 <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액 <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품 <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점 <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산~
## $ 내점일수 <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율 <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기 <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
## $ gender <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0~
## $ index <chr> "train", "train", "train", "train", "train", "train", "~
# Tag the test rows with their origin; they carry no gender column.
test <- mutate(x_test, index = "test")
glimpse(test)
## Rows: 2,482
## Columns: 11
## $ cust_id <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3~
## $ 총구매액 <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000~
## $ 최대구매액 <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000~
## $ 환불금액 <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, ~
## $ 주구매상품 <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화~
## $ 주구매지점 <chr> "부산본점", "잠실점", "본 점", "부산본점", "청량리점",~
## $ 내점일수 <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, ~
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20~
## $ 주말방문비율 <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500~
## $ 구매주기 <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2~
## $ index <chr> "test", "test", "test", "test", "test", "test", "test",~
# Stack train on top of test so both receive the same cleaning steps.
# bind_rows() fills the gender column with NA for the test rows.
full <- bind_rows(train, test)
glimpse(full)
## Rows: 5,982
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액 <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액 <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액 <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품 <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점 <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산~
## $ 내점일수 <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율 <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기 <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
## $ gender <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0~
## $ index <chr> "train", "train", "train", "train", "train", "train", "~
# Relabel the 0/1 gender code as text labels (0 -> "여성", anything else ->
# "남성"). The result is a character vector; rows with a missing gender
# (the test rows) stay NA.
# The earlier as.factor() conversion was dropped: its result was overwritten
# by this ifelse() on the next line, and it only made the `== 0` comparison
# depend on implicit factor-to-string coercion.
full$gender <- ifelse(full$gender == 0, "여성", "남성")

# Count missing values per column.
colSums(is.na(full))
## cust_id 총구매액 최대구매액 환불금액 주구매상품
## 0 0 0 3906 0
## 주구매지점 내점일수 내점당구매건수 주말방문비율 구매주기
## 0 0 0 0 0
## gender index
## 2482 0
# Replace a missing refund amount with 0.
full <- full %>%
  mutate(환불금액 = ifelse(is.na(환불금액), 0, 환불금액))

# List the column names before renaming them.
colnames(full)
## [1] "cust_id" "총구매액" "최대구매액" "환불금액"
## [5] "주구매상품" "주구매지점" "내점일수" "내점당구매건수"
## [9] "주말방문비율" "구매주기" "gender" "index"
# Give the Korean-named feature columns short English names and put the
# identifier, target and train/test marker first. Renaming is done inside
# select(), which also fixes the final column order.
data <- full %>%
  select(
    cust_id,
    gender,
    index,
    total = "총구매액",
    max = "최대구매액",
    refund = "환불금액",
    product = "주구매상품",
    store = "주구매지점",
    day = "내점일수",
    count = "내점당구매건수",
    week = "주말방문비율",
    cycle = "구매주기"
  )
glimpse(data)
## Rows: 5,982
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ gender <chr> "여성", "여성", "남성", "남성", "여성", "여성", "여성", "여성"~
## $ index <chr> "train", "train", "train", "train", "train", "train", "train",~
## $ total <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 11379000, 1005~
## $ max <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000, 761200~
## $ refund <dbl> 6860000, 300000, 0, 0, 0, 462000, 4582000, 29524000, 0, 0, 224~
## $ product <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자이너", "~
## $ store <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산점", "~
## $ day <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152, 26, 2~
## $ count <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666667, 2.~
## $ week <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000000, 0.~
## $ cycle <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, 2, 13, ~
# Count missing values per column of the renamed table.
data %>%
  is.na() %>%
  colSums()
## cust_id gender index total max refund product store day count
## 0 2482 0 0 0 0 0 0 0 0
## week cycle
## 0 0
library(recipes)
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
# Transform the seven numeric features on the combined train/test table:
# Yeo-Johnson transform first, then centring, then scaling. The recipe is
# prepped on `data` and juice() returns the processed version of that table.
data1 <- recipe(gender ~ ., data = data) %>%
  step_YeoJohnson(total, refund, max, day, count, cycle, week) %>%
  step_center(total, refund, max, day, count, cycle, week) %>%
  step_scale(total, refund, max, day, count, cycle, week) %>%
  prep() %>%
  juice()

# Split the processed table back into its two parts using the index marker.
train <- filter(data1, index == "train")
test <- filter(data1, index == "test")
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
# Fix the RNG state so the cross-validation folds, and therefore the selected
# cp value, are reproducible between runs.
set.seed(1234)

# Cross-validation that reports ROC / Sens / Spec; twoClassSummary needs
# class probabilities, hence classProbs = TRUE.
ctrl <- trainControl(
  method = "cv",
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Fit a CART model for gender, tuned on ROC.
# cust_id and index are dropped from the training data: cust_id is a row
# identifier whose values do not overlap between train and test, and index
# is the train/test marker (constant within train). With `gender ~ .` on the
# unfiltered table both were fed to the tree as predictors.
# NOTE: any console output recorded after this block was captured with the
# earlier 11-predictor fit and will differ once the script is re-run.
rpartfit <- train(
  gender ~ .,
  data = dplyr::select(train, -cust_id, -index),
  method = "rpart",
  metric = "ROC",
  trControl = ctrl
)
rpartfit
## CART
##
## 3500 samples
## 11 predictor
## 2 classes: '남성', '여성'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3149, 3150, 3151, 3150, 3150, 3150, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.005319149 0.6273140 0.3706859 0.7903230
## 0.006838906 0.6037011 0.3195061 0.8100163
## 0.007598784 0.6037011 0.3195061 0.8100163
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005319149.
# Score the test rows with the fitted model: class probabilities first,
# then the predicted class labels.
pred_fit <- predict(rpartfit, test, type = "prob")
pred_fit1 <- predict(rpartfit, test, type = "raw")

# Show the first few probability rows.
head(pred_fit)
## 남성 여성
## 1 0.2635710 0.7364290
## 2 0.2635710 0.7364290
## 3 0.2635710 0.7364290
## 4 0.5528302 0.4471698
## 5 0.5528302 0.4471698
## 6 0.3072289 0.6927711
# Show the first few predicted class labels.
pred_fit1 %>% head()
## [1] 여성 여성 여성 남성 남성 여성
## Levels: 남성 여성