bigdata_part04

# 제2작업형 
# 1. 기출문제 
# 'X_train.csv' : 고객의 상품 구매 속성
# 'y_train.csv' : 고객의 성별 데이터
# 'X_test.csv'  : 평가용 데이터, 종속변수 gender 없음
# 제출해야 할 테스트 데이터는 라벨이 존재하지 않기 때문에, 시험 문제 내에서는 
# Roc Auc를 구할 수 없고, 이진분류 확률값만 구할 수 있습니다
library(dplyr)

## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: ggplot2

## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: lattice

library(recipes)

## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 다음의 패키지를 부착합니다: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

x_test<-read.csv('X_test.csv')
x_train<-read.csv('X_train.csv')
y_train<-read.csv('y_train.csv')

x_train %>% glimpse

## Rows: 3,500
## Columns: 10
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산~
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~

y_train %>% glimpse

## Rows: 3,500
## Columns: 2
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ gender  <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,~

# 1. train dataset 

left_join(x_train,y_train,by='cust_id') %>% mutate(index='train')->train
train %>% glimpse

## Rows: 3,500
## Columns: 12
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900~
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,~
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,~
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자~
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산~
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152~
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666~
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000~
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, ~
## $ gender         <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0~
## $ index          <chr> "train", "train", "train", "train", "train", "train", "~

x_test %>% mutate(index='test')->test
test %>% glimpse

## Rows: 2,482
## Columns: 11
## $ cust_id        <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3~
## $ 총구매액       <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000~
## $ 최대구매액     <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000~
## $ 환불금액       <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, ~
## $ 주구매상품     <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화~
## $ 주구매지점     <chr> "부산본점", "잠실점", "본  점", "부산본점", "청량리점",~
## $ 내점일수       <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, ~
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20~
## $ 주말방문비율   <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500~
## $ 구매주기       <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2~
## $ index          <chr> "test", "test", "test", "test", "test", "test", "test",~

bind_rows(train,test)->full

# 2 목표변수 및 독립변수의 변환
# caret 패키지 목표변수의 범주형 변수
full$gender<-ifelse(full$gender==0,"남성","여성")
full$gender<-as.factor(full$gender)
full$index<-as.factor(full$index)
names(full)

##  [1] "cust_id"        "총구매액"       "최대구매액"     "환불금액"      
##  [5] "주구매상품"     "주구매지점"     "내점일수"       "내점당구매건수"
##  [9] "주말방문비율"   "구매주기"       "gender"         "index"

data<-full %>% rename(total="총구매액",
                      max="최대구매액",
                      refund="환불금액",
                      product="주구매상품",
                      store="주구매지점",
                      day="내점일수",
                      count= "내점당구매건수",
                      week="주말방문비율",
                      cycle="구매주기") %>% 
  select(cust_id,index,gender,total,max,refund,product,store,day,count,week,cycle)

# 3. 결측값 확인
colSums(is.na(data))

## cust_id   index  gender   total     max  refund product   store     day   count 
##       0       0    2482       0       0    3906       0       0       0       0 
##    week   cycle 
##       0       0

data$refund<-ifelse(is.na(data$refund),0,data$refund)
colSums(is.na(data))

## cust_id   index  gender   total     max  refund product   store     day   count 
##       0       0    2482       0       0       0       0       0       0       0 
##    week   cycle 
##       0       0

# 4. recipe 데이터 전처리
library(recipes)
recipe(gender~.,data=data) %>% step_YeoJohnson(total,max,refund,day,count,week,cycle) %>% 
  step_scale(total,max,refund,day,count,week,cycle) %>% 
  step_center(total,max,refund,day,count,week,cycle) %>% 
  prep() %>% juice()->data1

# 5. train data 분리
data1 %>% filter(index=="train") %>% select(-index)->train
data1 %>% filter(index=="test") %>% select(-index)->test

# 6. caret 패키지 학습하기
library(caret)
ctrl<-trainControl(method='cv',number=5,
                   summaryFunction = twoClassSummary,
                   classProbs =TRUE)

train(gender~.,data=train,
      method='rpart',
      metric="ROC",
      trControl=ctrl)->rffit

train(gender~.,data=train,
      method='glm',family=binomial,
      metric="ROC",
      trControl=ctrl)->rffit1

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

train(gender~.,data=train,
      method='knn',
      metric="ROC",
      trControl=ctrl)->rffit2

predict(rffit,test)->pred_fit
class(pred_fit)

## [1] "factor"

predict(rffit,test,type="prob")->pred_fit1
predict(rffit,test,type="raw")->pred_fit2
options(scipen = 999)


# 7 답안작성
# cust_id, gender
head(pred_fit1)

##        남성      여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289

names(pred_fit1)[1]<-"gender"
head(pred_fit1)

##      gender      여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289

bind_cols(x_test,pred_fit1) %>% select(cust_id,gender)->df
head(df)

##   cust_id    gender
## 1    3500 0.7364290
## 2    3501 0.7364290
## 3    3502 0.7364290
## 4    3503 0.4471698
## 5    3504 0.4471698
## 6    3505 0.6927711

setwd('c:/data')
write.csv(df,"2022.csv",row.names=FALSE)
read.csv("2022.csv") %>% head

##   cust_id    gender
## 1    3500 0.7364290
## 2    3501 0.7364290
## 3    3502 0.7364290
## 4    3503 0.4471698
## 5    3504 0.4471698
## 6    3505 0.6927711

write.csv(df,"2023.csv",row.names=TRUE)
read.csv("2023.csv") %>% head

##   X cust_id    gender
## 1 1    3500 0.7364290
## 2 2    3501 0.7364290
## 3 3    3502 0.7364290
## 4 4    3503 0.4471698
## 5 5    3504 0.4471698
## 6 6    3505 0.6927711

# row.names=TRUE  # TRUE면 행 이름을 CSV 파일에 포함하여 저장한다

bigdata_part04_1003

kim kye chul

2022 6 1