setwd("C:/Users/Administrator/Desktop/BIG DATA")
read.csv("commerce.csv") -> df
# 1. NA 데이터 확인
View(df)
colSums(is.na(df)) # ID Na 값이 많음 ## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 0 0 0 0
## CustomerID Country
## 135080 0
# Time 변수 정리해주고 my_df 변경
df %>%
mutate(mdy_hm = mdy_hm(InvoiceDate),
time_ymd = InvoiceDate %>% str_split(" ", simplify = TRUE)
%>% data.frame() %>% pull(X1) %>% mdy) -> my_df
# 변수를 선택해주고, 날짜 변수를 통한 파생변수 요일과 달을 뽑아낸다.
my_df %>%
select(CustomerID, time_ymd, StockCode, Quantity, UnitPrice, Country) %>%
mutate(days = wday(time_ymd, label = TRUE, abbr = TRUE),
month = month(time_ymd, label= TRUE, abbr= TRUE)) %>%
mutate(money = Quantity * UnitPrice)-> my_dfR : 최근 기간
F : 구매 빈도
M : 금엑 >> y- 의 구매 YES/NO 의 변수를 만든 후, -> Modelling 으로 도입한다.
# 함수 만들기
#지난 달 구매에 대한 구매 유무를 계산하는 방식
#지난 달 소비자 아이디의 구매 빈도와 구매금액을 바탕으로 다음 달 예측 모델링 작성
#1. train data 의 고객 ID
#2. lookdata 데이터에서의 고객 아이디 - 다음 달에 살 것인가?
#3. Data 프레임을 money 합산, 아이디 합산(빈도수), y - 최근 시간 - 시간
#4. 최소 빈도수를 나타낸다.
#5. f (frequnecy), m (money), r(recency) 합친 후, buynextmonth 변수 만들기기
processing_RFMdata <- function(month_training, month_lookback) {
train_df <- my_df %>%
filter(month %in% month_training)
lookback_df <- my_df %>%
filter(month == month_lookback)
customer_2months <- train_df$CustomerID %>%unique()
customer_nextmonth <- lookback_df$CustomerID %>%unique()
train_df %>%
group_by(CustomerID) %>%
summarise(money = sum(money)) %>%
ungroup() -> df_train_m
train_df %>%
group_by(CustomerID) %>%
count() %>%
ungroup() %>%
rename(freq= n) -> df_train_f
now_time <- max(train_df$time_ymd)
y <- as.numeric(now_time - train_df$time_ymd)
train_df %>%
mutate(recency = y) %>%
group_by(CustomerID) %>%
summarise(recency = min(recency)) %>%
ungroup() -> df_train_r
df_modelling <- df_train_f %>%
full_join(df_train_m, by = "CustomerID") %>%
full_join(df_train_r, by = "CustomerID") %>%
mutate(BuyNextMonth = case_when(CustomerID %in% customer_nextmonth ~ "Yes",
TRUE ~ "No")) %>%
mutate(BuyNextMonth = as.factor(BuyNextMonth))
return(df_modelling)
}month_training23 <- c("2", "3")
processing_RFMdata(month_training = month_training23,
month_lookback = c("3")) -> df_modelling
df_modelling %>% head()## # A tibble: 6 x 5
## CustomerID freq money recency BuyNextMonth
## <int> <int> <dbl> <dbl> <fct>
## 1 12350 17 334. 57 No
## 2 12352 38 1562. 9 Yes
## 3 12359 80 1839. 52 No
## 4 12361 10 190. 34 No
## 5 12362 27 479. 42 No
## 6 12365 22 641. 38 No
df_forMl <- df_modelling %>%
select(-CustomerID) %>%
mutate_if(is.numeric, function(x){(x -min(x)/(max(x)- min(x)))}) #정규화 set.seed(1)
flag <- createDataPartition(df_forMl$BuyNextMonth, p=0.8, list= FALSE)
train <- df_forMl[flag,]
test <- df_forMl[-flag,]
# 비교 정제조건 설정 하기
set.seed(1)
number <- 5
repeats <- 3
## TrainCOntrol 함수 : 일관된 비교방법을 후보에게 동일 적용
control <- trainControl(method = "repeatedcv",
number = number,
classProbs = TRUE,
savePredictions = "final",
index = createResample(df_forMl$BuyNextMonth,
repeats*number),
summaryFunction = twoClassSummary,
allowParallel = TRUE)
library(doParallel)
registerDoParallel(cores = detectCores() - 1)
my_models <- c("adaboost", "xgbTree", "svmRadial",
"knn", "gbm", "C5.0", "ranger",
"rf", "nnet", "glm", "lda", "treebag",
"bagFDA", "glmboost", "cforest")
# 모델 훈련 시키기 - 15개의 모델을 한번에 훈련시킨다.
library(caretEnsemble)
set.seed(1)
system.time(model_list1 <- caretList(BuyNextMonth ~.,
data = df_forMl,
trControl = control,
metric = "ROC",
methodList = my_models))## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0703 nan 0.1000 0.0945
## 2 0.9229 nan 0.1000 0.0750
## 3 0.8035 nan 0.1000 0.0594
## 4 0.7043 nan 0.1000 0.0502
## 5 0.6206 nan 0.1000 0.0433
## 6 0.5489 nan 0.1000 0.0347
## 7 0.4871 nan 0.1000 0.0312
## 8 0.4334 nan 0.1000 0.0272
## 9 0.3864 nan 0.1000 0.0232
## 10 0.3451 nan 0.1000 0.0206
## 20 0.1183 nan 0.1000 0.0064
## 40 0.0155 nan 0.1000 0.0008
## 50 0.0057 nan 0.1000 0.0003
##
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
##
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
##
## # weights: 16
## initial value 910.213601
## iter 10 value 816.415266
## iter 20 value 557.731964
## iter 30 value 62.921474
## iter 40 value 3.066346
## iter 50 value 0.048891
## iter 60 value 0.001777
## iter 70 value 0.001008
## final value 0.000093
## converged
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## user system elapsed
## 37.76 2.15 645.47
list_of_result <- lapply(my_models, function(x) {model_list1[[x]]$resample})
total_df <- do.call("bind_rows", list_of_result)
total_df %>% mutate(Model = lapply(my_models, function(x) {rep(x, number*repeats)})
%>% unlist()) -> total_df
head(total_df,10)## ROC Sens Spec Resample Model
## 1 1 1 1 Resample01 adaboost
## 2 1 1 1 Resample02 adaboost
## 3 1 1 1 Resample03 adaboost
## 4 1 1 1 Resample14 adaboost
## 5 1 1 1 Resample12 adaboost
## 6 1 1 1 Resample10 adaboost
## 7 1 1 1 Resample08 adaboost
## 8 1 1 1 Resample06 adaboost
## 9 1 1 1 Resample04 adaboost
## 10 1 1 1 Resample15 adaboost
total_df %>%
select(-Resample) %>%
group_by(Model) %>%
summarise(avg_roc = round(mean(ROC),2),
avg_sens = round(mean(Sens),2),
avg_Spec = mean(Spec)) %>%
ungroup() -> df_resultModelling Visualization
## # A tibble: 15 x 4
## Model avg_roc avg_sens avg_Spec
## <chr> <dbl> <dbl> <dbl>
## 1 adaboost 1 1 1
## 2 bagFDA 1 0.89 1
## 3 C5.0 1 1 1
## 4 cforest 1 1 1
## 5 gbm 1 1 1
## 6 glm 1 1 1
## 7 glmboost 1 1 0.966
## 8 knn 0.96 0.78 0.958
## 9 lda 1 1 0.985
## 10 nnet 1 1 1.00
## 11 ranger 1 1 1
## 12 rf 1 1 1
## 13 svmRadial 1 0.99 0.998
## 14 treebag 1 1 1
## 15 xgbTree 1 1 1
df_result %>%
ggplot(aes(Model, avg_sens))+
geom_col(fill = "indianred3", color = "white")+
coord_flip()+
geom_text(aes(label = avg_sens), color="white", hjust = 1.5)+
theme(panel.grid = element_blank()) -> p1
df_result %>%
ggplot(aes(Model, avg_roc))+
geom_col(fill = "indianred3", color = "white")+
coord_flip()+
geom_text(aes(label = avg_roc), color="white", hjust = 1.5)+
theme(panel.grid = element_blank()) -> p2
df_result %>%
ggplot(aes(Model, avg_Spec))+
geom_col(fill = "indianred3", color = "white")+
coord_flip()+
geom_text(aes(label = avg_Spec), color="white", hjust = 1.5)+
theme(panel.grid = element_blank()) -> p3
gridExtra::grid.arrange(p1, p2, p3, nrow=1)GBM 도입
set.seed(1)
control_ml <- trainControl(method = "repeatedcv",
number= number,
repeats = repeats,
classProbs = TRUE,
summaryFunction = twoClassSummary,
allowParallel = TRUE)
set.seed(30)
train(BuyNextMonth ~.,
method ="gbm",
data= train,
trControl = control_ml,
metric ="ROC") -> gbm_default## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0709 nan 0.1000 0.0963
## 2 0.9234 nan 0.1000 0.0734
## 3 0.8039 nan 0.1000 0.0594
## 4 0.7047 nan 0.1000 0.0483
## 5 0.6209 nan 0.1000 0.0402
## 6 0.5492 nan 0.1000 0.0353
## 7 0.4874 nan 0.1000 0.0316
## 8 0.4336 nan 0.1000 0.0273
## 9 0.3866 nan 0.1000 0.0236
## 10 0.3453 nan 0.1000 0.0202
## 20 0.1184 nan 0.1000 0.0065
## 40 0.0155 nan 0.1000 0.0008
## 50 0.0057 nan 0.1000 0.0003
predict(gbm_default, test) -> pred
test$BuyNextMonth -> actual
confusionMatrix(pred, actual, positive = "Yes")## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 93 0
## Yes 0 195
##
## Accuracy : 1
## 95% CI : (0.9873, 1)
## No Information Rate : 0.6771
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6771
## Detection Rate : 0.6771
## Detection Prevalence : 0.6771
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Yes
##