setwd("C:/Users/Administrator/Desktop/BIG DATA")

read.csv("commerce.csv") -> df

# 1. NA 데이터 확인 

View(df)

colSums(is.na(df)) # ID Na 값이 많음

##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##           0           0           0           0           0           0 
##  CustomerID     Country 
##      135080           0

# Selecting variables 

df %>% 
  filter(Quantity > 0, UnitPrice > 0) -> df

# Time 변수 정리해주고 my_df 변경 

df %>% 
  mutate(mdy_hm = mdy_hm(InvoiceDate),
         time_ymd = InvoiceDate %>% str_split(" ", simplify = TRUE)
          %>% data.frame() %>% pull(X1) %>% mdy) -> my_df


# 변수를 선택해주고, 날짜 변수를 통한 파생변수 요일과 달을 뽑아낸다. 
my_df %>% 
  select(CustomerID, time_ymd, StockCode, Quantity, UnitPrice, Country) %>% 
  mutate(days = wday(time_ymd, label = TRUE, abbr = TRUE),
         month = month(time_ymd, label= TRUE, abbr= TRUE)) %>% 
  mutate(money = Quantity * UnitPrice)-> my_df

R : 최근 기간

F : 구매 빈도

M : 금엑 >> y- 의 구매 YES/NO 의 변수를 만든 후, -> Modelling 으로 도입한다.

# 함수 만들기 


#지난 달 구매에 대한 구매 유무를 계산하는 방식  
#지난 달 소비자 아이디의 구매 빈도와 구매금액을 바탕으로 다음 달 예측 모델링 작성 

#1. train data 의 고객 ID 
#2. lookdata 데이터에서의 고객 아이디 - 다음 달에 살 것인가? 
#3. Data 프레임을 money 합산, 아이디 합산(빈도수), y - 최근 시간  - 시간 
#4. 최소 빈도수를 나타낸다. 
#5. f (frequnecy), m (money), r(recency) 합친 후, buynextmonth 변수 만들기기 


processing_RFMdata <- function(month_training, month_lookback) {
  
  train_df <- my_df %>% 
    filter(month %in% month_training)
  
  lookback_df <- my_df %>% 
    filter(month == month_lookback)
  
  customer_2months <- train_df$CustomerID %>%unique()
  
  customer_nextmonth <- lookback_df$CustomerID %>%unique()
  
  train_df %>% 
    group_by(CustomerID) %>% 
    summarise(money = sum(money)) %>% 
    ungroup() -> df_train_m
  
  train_df %>% 
    group_by(CustomerID) %>% 
    count() %>% 
    ungroup() %>% 
    rename(freq= n) -> df_train_f
  
  now_time <- max(train_df$time_ymd)
  y <- as.numeric(now_time - train_df$time_ymd)
  
  
  train_df %>% 
  mutate(recency = y) %>% 
  group_by(CustomerID) %>% 
  summarise(recency = min(recency)) %>% 
  ungroup() -> df_train_r
  

  df_modelling <- df_train_f %>% 
  full_join(df_train_m, by = "CustomerID") %>% 
  full_join(df_train_r, by = "CustomerID") %>% 
  mutate(BuyNextMonth = case_when(CustomerID %in% customer_nextmonth ~ "Yes", 
                                  TRUE ~ "No")) %>% 
  mutate(BuyNextMonth = as.factor(BuyNextMonth))
  
  
  return(df_modelling)
  
}

month_training23 <- c("2", "3")

processing_RFMdata(month_training = month_training23, 
                   month_lookback = c("3")) -> df_modelling

df_modelling %>% head()

## # A tibble: 6 x 5
##   CustomerID  freq money recency BuyNextMonth
##        <int> <int> <dbl>   <dbl> <fct>       
## 1      12350    17  334.      57 No          
## 2      12352    38 1562.       9 Yes         
## 3      12359    80 1839.      52 No          
## 4      12361    10  190.      34 No          
## 5      12362    27  479.      42 No          
## 6      12365    22  641.      38 No

df_forMl <- df_modelling %>% 
  select(-CustomerID) %>% 
  mutate_if(is.numeric, function(x){(x -min(x)/(max(x)- min(x)))}) #정규화

set.seed(1)

flag <- createDataPartition(df_forMl$BuyNextMonth, p=0.8, list= FALSE)

train <- df_forMl[flag,]
test <- df_forMl[-flag,]

# 비교 정제조건 설정 하기 

set.seed(1)
number <- 5
repeats <- 3

## TrainCOntrol 함수 : 일관된 비교방법을 후보에게 동일 적용  

control <- trainControl(method = "repeatedcv",
                        number = number,
                        classProbs = TRUE, 
                        savePredictions = "final",
                        index = createResample(df_forMl$BuyNextMonth,
                                               repeats*number), 
                        summaryFunction = twoClassSummary, 
                        allowParallel = TRUE)

library(doParallel)
registerDoParallel(cores = detectCores() - 1)



my_models <- c("adaboost", "xgbTree", "svmRadial", 
               "knn",  "gbm", "C5.0", "ranger",
               "rf", "nnet", "glm", "lda", "treebag", 
               "bagFDA", "glmboost", "cforest")


# 모델 훈련 시키기 - 15개의 모델을 한번에 훈련시킨다. 
library(caretEnsemble)
set.seed(1)
system.time(model_list1 <- caretList(BuyNextMonth ~., 
                                     data = df_forMl,
                                     trControl = control,
                                     metric = "ROC", 
                                     methodList = my_models))

## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0703             nan     0.1000    0.0945
##      2        0.9229             nan     0.1000    0.0750
##      3        0.8035             nan     0.1000    0.0594
##      4        0.7043             nan     0.1000    0.0502
##      5        0.6206             nan     0.1000    0.0433
##      6        0.5489             nan     0.1000    0.0347
##      7        0.4871             nan     0.1000    0.0312
##      8        0.4334             nan     0.1000    0.0272
##      9        0.3864             nan     0.1000    0.0232
##     10        0.3451             nan     0.1000    0.0206
##     20        0.1183             nan     0.1000    0.0064
##     40        0.0155             nan     0.1000    0.0008
##     50        0.0057             nan     0.1000    0.0003
## 
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## 
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## 
## # weights:  16
## initial  value 910.213601 
## iter  10 value 816.415266
## iter  20 value 557.731964
## iter  30 value 62.921474
## iter  40 value 3.066346
## iter  50 value 0.048891
## iter  60 value 0.001777
## iter  70 value 0.001008
## final  value 0.000093 
## converged
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .

##    user  system elapsed 
##   37.76    2.15  645.47

list_of_result <- lapply(my_models, function(x) {model_list1[[x]]$resample})


total_df <- do.call("bind_rows", list_of_result)


total_df %>% mutate(Model = lapply(my_models, function(x) {rep(x, number*repeats)})
                    %>% unlist()) -> total_df


head(total_df,10)

##    ROC Sens Spec   Resample    Model
## 1    1    1    1 Resample01 adaboost
## 2    1    1    1 Resample02 adaboost
## 3    1    1    1 Resample03 adaboost
## 4    1    1    1 Resample14 adaboost
## 5    1    1    1 Resample12 adaboost
## 6    1    1    1 Resample10 adaboost
## 7    1    1    1 Resample08 adaboost
## 8    1    1    1 Resample06 adaboost
## 9    1    1    1 Resample04 adaboost
## 10   1    1    1 Resample15 adaboost

total_df %>% 
  select(-Resample) %>% 
  group_by(Model) %>% 
  summarise(avg_roc = round(mean(ROC),2), 
            avg_sens = round(mean(Sens),2),
            avg_Spec = mean(Spec)) %>% 
  ungroup() -> df_result

Modelling Visualization

df_result

## # A tibble: 15 x 4
##    Model     avg_roc avg_sens avg_Spec
##    <chr>       <dbl>    <dbl>    <dbl>
##  1 adaboost     1        1       1    
##  2 bagFDA       1        0.89    1    
##  3 C5.0         1        1       1    
##  4 cforest      1        1       1    
##  5 gbm          1        1       1    
##  6 glm          1        1       1    
##  7 glmboost     1        1       0.966
##  8 knn          0.96     0.78    0.958
##  9 lda          1        1       0.985
## 10 nnet         1        1       1.00 
## 11 ranger       1        1       1    
## 12 rf           1        1       1    
## 13 svmRadial    1        0.99    0.998
## 14 treebag      1        1       1    
## 15 xgbTree      1        1       1

df_result %>% 
  ggplot(aes(Model, avg_sens))+
  geom_col(fill = "indianred3", color = "white")+
  coord_flip()+
  geom_text(aes(label = avg_sens),  color="white", hjust = 1.5)+
  theme(panel.grid = element_blank()) -> p1

df_result %>% 
  ggplot(aes(Model, avg_roc))+
  geom_col(fill = "indianred3", color = "white")+
  coord_flip()+
  geom_text(aes(label = avg_roc),  color="white", hjust = 1.5)+
  theme(panel.grid = element_blank()) -> p2

df_result %>% 
  ggplot(aes(Model, avg_Spec))+
  geom_col(fill = "indianred3", color = "white")+
  coord_flip()+
  geom_text(aes(label = avg_Spec),  color="white", hjust = 1.5)+
  theme(panel.grid = element_blank()) -> p3


gridExtra::grid.arrange(p1, p2, p3, nrow=1)

GBM 도입

set.seed(1)

control_ml <- trainControl(method = "repeatedcv", 
                          number= number, 
                          repeats = repeats,
                          classProbs = TRUE, 
                          summaryFunction = twoClassSummary, 
                          allowParallel = TRUE)


set.seed(30)

train(BuyNextMonth ~., 
      method ="gbm",
      data= train, 
      trControl = control_ml, 
      metric ="ROC") -> gbm_default

## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0709             nan     0.1000    0.0963
##      2        0.9234             nan     0.1000    0.0734
##      3        0.8039             nan     0.1000    0.0594
##      4        0.7047             nan     0.1000    0.0483
##      5        0.6209             nan     0.1000    0.0402
##      6        0.5492             nan     0.1000    0.0353
##      7        0.4874             nan     0.1000    0.0316
##      8        0.4336             nan     0.1000    0.0273
##      9        0.3866             nan     0.1000    0.0236
##     10        0.3453             nan     0.1000    0.0202
##     20        0.1184             nan     0.1000    0.0065
##     40        0.0155             nan     0.1000    0.0008
##     50        0.0057             nan     0.1000    0.0003

predict(gbm_default, test) -> pred

test$BuyNextMonth -> actual


confusionMatrix(pred, actual, positive = "Yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   93   0
##        Yes   0 195
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9873, 1)
##     No Information Rate : 0.6771     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.6771     
##          Detection Rate : 0.6771     
##    Detection Prevalence : 0.6771     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Yes        
##

고객구매예측

DOEUN

2020-04-07

Modelling Visualization