# 제2작업형
# 1. 기출문제
# 'X_train.csv' : 고객의 상품 구매 속성
# 'y_train.csv' : 고객의 성별 데이터
# 'X_test.csv' : 평가용 데이터, 종속변수 gender 없음
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lattice
library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
x_test<-read.csv('X_test.csv')
x_train<-read.csv('X_train.csv')
y_train<-read.csv('y_train.csv')
x_train %>% glimpse
## Rows: 3,500
## Columns: 10
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액 <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액 <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액 <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품 <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점 <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산~
## $ 내점일수 <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율 <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기 <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
y_train %>% glimpse
## Rows: 3,500
## Columns: 2
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ gender <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,~
# 1 train dataset
left_join(x_train,y_train,by='cust_id') %>% mutate(index='train')->train
train %>% glimpse
## Rows: 3,500
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액 <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액 <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액 <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품 <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점 <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산~
## $ 내점일수 <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율 <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기 <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
## $ gender <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0~
## $ index <chr> "train", "train", "train", "train", "train", "train", "~
x_test %>% mutate(index='test')->test
test %>% glimpse
## Rows: 2,482
## Columns: 11
## $ cust_id <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3~
## $ 총구매액 <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000~
## $ 최대구매액 <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000~
## $ 환불금액 <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, ~
## $ 주구매상품 <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화~
## $ 주구매지점 <chr> "부산본점", "잠실점", "본 점", "부산본점", "청량리점",~
## $ 내점일수 <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, ~
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20~
## $ 주말방문비율 <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500~
## $ 구매주기 <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2~
## $ index <chr> "test", "test", "test", "test", "test", "test", "test",~
bind_rows(train,test)->full
# 2 목표변수 및 독립변수의 변환
# caret 패키지 목표변수의 범주형 변수
full$gender<-ifelse(full$gender==0,"male","female")
full$gender<-as.factor(full$gender)
full$index<-as.factor(full$index)
names(full)
## [1] "cust_id" "총구매액" "최대구매액" "환불금액"
## [5] "주구매상품" "주구매지점" "내점일수" "내점당구매건수"
## [9] "주말방문비율" "구매주기" "gender" "index"
data<-full %>% rename(total="총구매액",
max="최대구매액",
refund="환불금액",
product="주구매상품",
store="주구매지점",
day="내점일수",
count= "내점당구매건수",
week="주말방문비율",
cycle="구매주기") %>%
select(cust_id,index,gender,total,max,refund,product,store,day,count,week,cycle)
# 3. 결측값 확인
colSums(is.na(data))
## cust_id index gender total max refund product store day count
## 0 0 2482 0 0 3906 0 0 0 0
## week cycle
## 0 0
data$refund<-ifelse(is.na(data$refund),0,data$refund)
colSums(is.na(data))
## cust_id index gender total max refund product store day count
## 0 0 2482 0 0 0 0 0 0 0
## week cycle
## 0 0
# 4. recipe 데이터 전처리
library(recipes)
recipe(gender~.,data=data) %>% step_YeoJohnson(total,max,refund,day,count,week,cycle) %>%
step_scale(total,max,refund,day,count,week,cycle) %>%
step_center(total,max,refund,day,count,week,cycle) %>%
prep() %>% juice()->data1
# 5. train data 분리
data1 %>% filter(index=="train") %>% select(-index)->train
data1 %>% filter(index=="test") %>% select(-index)->test
# 6. caret 패키지 학습하기
library(caret)
ctrl<-trainControl(method='cv',number=5,
summaryFunction = twoClassSummary,
classProbs =TRUE)
train(gender~.,data=train,
method='rpart',
metric="ROC",
trControl=ctrl)->rffit
train(gender~.,data=train,
method='glm',family=binomial,
metric="ROC",
trControl=ctrl)->rffit1
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
train(gender~.,data=train,
method='knn',
metric="ROC",
trControl=ctrl)->rffit2
predict(rffit,test)->pred_fit
class(pred_fit)
## [1] "factor"
predict(rffit,test,type="prob")->pred_fit1
predict(rffit,test,type="raw")->pred_fit2
options(scipen = 999)
# 7 답안작성
# cust_id, gender
names(pred_fit1)[1]<-"gender"
bind_cols(x_test,pred_fit1) %>% select(cust_id,gender)->df
#write.csv(df,"2022.csv",row.names=FALSE)
#write.csv(df,"2023.csv",row.names=TRUE)
# row.names=TRUE # TRUE면 행 이름을 CSV 파일에 포함하여 저장한다.