데이터 출처 https://www.kaggle.com/regivm/retailtransactiondata

1.필요 라이브러리 설치

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(doBy)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year

2. 데이불 불러오기: 위의 링크 해놓은 캐글에서 데이터 다운 받은후 실행

Retail_Data_R<-read.csv("Retail_Data_Response.csv")
Retail_Data_T<-read.csv("Retail_Data_Transactions.csv")

3. 데이터 구조 확인

c(str(Retail_Data_R),str(Retail_Data_T), dim(Retail_Data_R), dim(Retail_Data_T))
## 'data.frame':    6884 obs. of  2 variables:
##  $ customer_id: Factor w/ 6884 levels "CS1112","CS1113",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ response   : int  0 0 1 1 1 0 1 0 0 0 ...
## 'data.frame':    125000 obs. of  3 variables:
##  $ customer_id: Factor w/ 6889 levels "CS1112","CS1113",..: 4184 3657 1011 106 739 4428 1613 4791 4929 2691 ...
##  $ trans_date : Factor w/ 1401 levels "01-Apr-12","01-Apr-13",..: 473 675 1164 726 912 1181 242 1347 339 881 ...
##  $ tran_amount: int  35 39 52 99 78 81 93 89 76 75 ...
## [1]   6884      2 125000      3

4. 날짜 데이터 형식 변경하기: factor를 date형식으로

Retail_Data_T$trans_date<-dmy(Retail_Data_T$trans_date)

5. 고객수 확인

c(length(Retail_Data_R$customer_id),length(unique(Retail_Data_T$customer_id)))
## [1] 6884 6889

6.구매기간 확인

c(max(Retail_Data_T$trans_date),min(Retail_Data_T$trans_date))
## [1] "2015-03-16" "2011-05-16"

7.고객별 구매금액의 합계, 평균, 중위수, 구매횟수(*데이터에 NA 값이 있을 때는 na.rm=T 사용, length에는 적용 안됨)

Cust_Sum<-summaryBy(tran_amount~customer_id, data = Retail_Data_T, 
                    FUN = c(sum, mean, median, length))
head(Cust_Sum)
##   customer_id tran_amount.sum tran_amount.mean tran_amount.median
## 1      CS1112            1012         67.46667               63.0
## 2      CS1113            1490         74.50000               79.5
## 3      CS1114            1432         75.36842               79.0
## 4      CS1115            1659         75.40909               72.5
## 5      CS1116             857         65.92308               60.0
## 6      CS1117            1185         69.70588               67.0
##   tran_amount.length
## 1                 15
## 2                 20
## 3                 19
## 4                 22
## 5                 13
## 6                 17

8.컬럼 이름 변경: 컬럼 이름 축약해서 보기 편하게 하기 위해

Cust_Sum<-rename(Cust_Sum, sum = tran_amount.sum, mean = tran_amount.mean,
                             median= tran_amount.median, length=tran_amount.length)

9.1 데이터 분포 보기-sum

hist(Cust_Sum$sum)

9.2 데이터분포 보기-mean

hist(Cust_Sum$mean)

9.3 데이터 분포보기- median

hist(Cust_Sum$median)

9.4 데이터 분포보기- length

hist(Cust_Sum$length)

10. 구매 데이터를 사용해 고객의 구매 주기를 계산

10.1. 구매데이터 고객별, 날짜별로 정렬

Retail_Data_T<-Retail_Data_T[order(Retail_Data_T$customer_id, Retail_Data_T$trans_date),]
head(Retail_Data_T)
##       customer_id trans_date tran_amount
## 77248      CS1112 2011-06-15          56
## 89150      CS1112 2011-08-19          96
## 68207      CS1112 2011-10-02          60
## 36487      CS1112 2012-04-08          56
## 93075      CS1112 2012-06-24          52
## 67736      CS1112 2012-07-03          81

10.2. 각 고객의 구매일간 주기를 계산(첫번째 구매일과 두 번째 구매일간의 차이)

Retail_Data_T$trans_date_lag<-shift(Retail_Data_T$trans_date, n=1, fill=NA, type = "lag")
Retail_Data_T$trans_date_diff<-Retail_Data_T$trans_date - Retail_Data_T$trans_date_lag
Retail_Data_T$trans_date_final<-ifelse(Retail_Data_T$customer_id == shift(Retail_Data_T$customer_id, n=1, fill = NA, type = "lag"),
       Retail_Data_T$trans_date-shift(Retail_Data_T$trans_date, n=1, fill=NA, type = "lag"),
       NA)
head(Retail_Data_T)
##       customer_id trans_date tran_amount trans_date_lag trans_date_diff
## 77248      CS1112 2011-06-15          56           <NA>         NA days
## 89150      CS1112 2011-08-19          96     2011-06-15         65 days
## 68207      CS1112 2011-10-02          60     2011-08-19         44 days
## 36487      CS1112 2012-04-08          56     2011-10-02        189 days
## 93075      CS1112 2012-06-24          52     2012-04-08         77 days
## 67736      CS1112 2012-07-03          81     2012-06-24          9 days
##       trans_date_final
## 77248               NA
## 89150               65
## 68207               44
## 36487              189
## 93075               77
## 67736                9

11. 고객별 구매주기 평균 계산

Cust_Pattern<-summaryBy(trans_date_final~customer_id, data = Retail_Data_T, FUN= mean, na.rm =TRUE)
head(Cust_Pattern)
##   customer_id trans_date_final.mean
## 1      CS1112              93.50000
## 2      CS1113              71.26316
## 3      CS1114              72.72222
## 4      CS1115              62.04762
## 5      CS1116              96.25000
## 6      CS1117              71.18750

12.Retail_Data_R 데이터 병합

Cust_Master<-merge((merge(Cust_Sum,Cust_Pattern,all.x=T,by="customer_id")),
                   Retail_Data_R,all=T,by="customer_id")

13. 데이터간 상관성 파악

plot(Cust_Master[,2:length(Cust_Master)])

RFM기분에 따른 고객 분류

14. 구매주기기준 고객 분류_H,M,L

q1<-quantile(Cust_Master$trans_date_final.mean)
Cust_Master$Recency<-ifelse(Cust_Master$trans_date_final.mean<q1[2],"H",
                              ifelse(Cust_Master$trans_date_final.mean<q1[4],"M","L"))
head(Cust_Master)
##   customer_id  sum     mean median length trans_date_final.mean response
## 1      CS1112 1012 67.46667   63.0     15              93.50000        0
## 2      CS1113 1490 74.50000   79.5     20              71.26316        0
## 3      CS1114 1432 75.36842   79.0     19              72.72222        1
## 4      CS1115 1659 75.40909   72.5     22              62.04762        1
## 5      CS1116  857 65.92308   60.0     13              96.25000        1
## 6      CS1117 1185 69.70588   67.0     17              71.18750        0
##   Recency
## 1       L
## 2       M
## 3       M
## 4       M
## 5       L
## 6       M

15.구매횟수에 따른 고객 분류_H,M,L

q2<-quantile(Cust_Master$length)
Cust_Master$Frequency<-ifelse(Cust_Master$length<q2[2],"L",
                              ifelse(Cust_Master$length<q2[4],"M","H"))
head(Cust_Master)
##   customer_id  sum     mean median length trans_date_final.mean response
## 1      CS1112 1012 67.46667   63.0     15              93.50000        0
## 2      CS1113 1490 74.50000   79.5     20              71.26316        0
## 3      CS1114 1432 75.36842   79.0     19              72.72222        1
## 4      CS1115 1659 75.40909   72.5     22              62.04762        1
## 5      CS1116  857 65.92308   60.0     13              96.25000        1
## 6      CS1117 1185 69.70588   67.0     17              71.18750        0
##   Recency Frequency
## 1       L         M
## 2       M         M
## 3       M         M
## 4       M         H
## 5       L         L
## 6       M         M

16. 구매 금액 기준 고객 분류_H, M, L

q3<-quantile(Cust_Master$sum)
Cust_Master$Monetary<-ifelse(Cust_Master$sum<q3[2],"L",
                                          ifelse(Cust_Master$sum<q3[4],"M","H"))
head(Cust_Master)
##   customer_id  sum     mean median length trans_date_final.mean response
## 1      CS1112 1012 67.46667   63.0     15              93.50000        0
## 2      CS1113 1490 74.50000   79.5     20              71.26316        0
## 3      CS1114 1432 75.36842   79.0     19              72.72222        1
## 4      CS1115 1659 75.40909   72.5     22              62.04762        1
## 5      CS1116  857 65.92308   60.0     13              96.25000        1
## 6      CS1117 1185 69.70588   67.0     17              71.18750        0
##   Recency Frequency Monetary
## 1       L         M        M
## 2       M         M        M
## 3       M         M        M
## 4       M         H        H
## 5       L         L        M
## 6       M         M        M

17.RFM 기준으로 각 값의 평균과 횟수 구하기

Result<-summaryBy(.~Recency+Frequency+Monetary, data=Cust_Master, FUN=c(mean,length))
head(Result)
##   Recency Frequency Monetary  sum.mean mean.mean median.mean length.mean
## 1       H         H        H 1801.3221  70.67305    70.89798    25.50299
## 2       H         H        M 1412.9514  62.26471    60.29514    22.74306
## 3       H         L        L  454.6667  49.09015    49.58333     9.50000
## 4       H         L        M  891.0000  68.53846    66.00000    13.00000
## 5       H         M        H 1572.1707  75.04506    76.95122    20.95122
## 6       H         M        L  742.5000  45.89179    45.91667    16.16667
##   trans_date_final.mean.mean response.mean sum.length mean.length
## 1                   52.90972     0.1831091       1338        1338
## 2                   57.00995     0.1875000        144         144
## 3                   50.44815            NA          6           6
## 4                   58.25000     0.0000000          1           1
## 5                   57.60931     0.2195122         41          41
## 6                   56.58502     0.1666667          6           6
##   median.length length.length trans_date_final.mean.length response.length
## 1          1338          1338                         1338            1338
## 2           144           144                          144             144
## 3             6             6                            6               6
## 4             1             1                            1               1
## 5            41            41                           41              41
## 6             6             6                            6               6

18.분석에 필요한 컬럼만 선택하여 컬럼명과 순서 변경: 결과 인식을 용이하게 하기 위해

names(Result)
##  [1] "Recency"                      "Frequency"                   
##  [3] "Monetary"                     "sum.mean"                    
##  [5] "mean.mean"                    "median.mean"                 
##  [7] "length.mean"                  "trans_date_final.mean.mean"  
##  [9] "response.mean"                "sum.length"                  
## [11] "mean.length"                  "median.length"               
## [13] "length.length"                "trans_date_final.mean.length"
## [15] "response.length"
Result_F<-select(Result,-(contains("median")), -(starts_with("response")),-(starts_with("mean")))
Result_F<-rename(Result_F, Monetary.mean=sum.mean, Frequency.mean=length.mean, 
                 Recency.mean=trans_date_final.mean.mean, Monetary.length=sum.length,
                 Frequency.length=length.length, Recency.length=trans_date_final.mean.length)

Result_F<-Result_F[,c(1,2,3,6,5,4,9,8,7)]
head(Result_F)
##   Recency Frequency Monetary Recency.mean Frequency.mean Monetary.mean
## 1       H         H        H     52.90972       25.50299     1801.3221
## 2       H         H        M     57.00995       22.74306     1412.9514
## 3       H         L        L     50.44815        9.50000      454.6667
## 4       H         L        M     58.25000       13.00000      891.0000
## 5       H         M        H     57.60931       20.95122     1572.1707
## 6       H         M        L     56.58502       16.16667      742.5000
##   Recency.length Frequency.length Monetary.length
## 1           1338             1338            1338
## 2            144              144             144
## 3              6                6               6
## 4              1                1               1
## 5             41               41              41
## 6              6                6               6

19. 엑셀 파일로 추출하여 값 확인하기

write.xlsx(Result_F, "RFM_Result.xlsx")