1.필요 라이브러리 설치
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(doBy)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
2. 데이불 불러오기: 위의 링크 해놓은 캐글에서 데이터 다운 받은후 실행
Retail_Data_R<-read.csv("Retail_Data_Response.csv")
Retail_Data_T<-read.csv("Retail_Data_Transactions.csv")
3. 데이터 구조 확인
c(str(Retail_Data_R),str(Retail_Data_T), dim(Retail_Data_R), dim(Retail_Data_T))
## 'data.frame': 6884 obs. of 2 variables:
## $ customer_id: Factor w/ 6884 levels "CS1112","CS1113",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ response : int 0 0 1 1 1 0 1 0 0 0 ...
## 'data.frame': 125000 obs. of 3 variables:
## $ customer_id: Factor w/ 6889 levels "CS1112","CS1113",..: 4184 3657 1011 106 739 4428 1613 4791 4929 2691 ...
## $ trans_date : Factor w/ 1401 levels "01-Apr-12","01-Apr-13",..: 473 675 1164 726 912 1181 242 1347 339 881 ...
## $ tran_amount: int 35 39 52 99 78 81 93 89 76 75 ...
## [1] 6884 2 125000 3
4. 날짜 데이터 형식 변경하기: factor를 date형식으로
Retail_Data_T$trans_date<-dmy(Retail_Data_T$trans_date)
5. 고객수 확인
c(length(Retail_Data_R$customer_id),length(unique(Retail_Data_T$customer_id)))
## [1] 6884 6889
6.구매기간 확인
c(max(Retail_Data_T$trans_date),min(Retail_Data_T$trans_date))
## [1] "2015-03-16" "2011-05-16"
7.고객별 구매금액의 합계, 평균, 중위수, 구매횟수(*데이터에 NA 값이 있을 때는 na.rm=T 사용, length에는 적용 안됨)
Cust_Sum<-summaryBy(tran_amount~customer_id, data = Retail_Data_T,
FUN = c(sum, mean, median, length))
head(Cust_Sum)
## customer_id tran_amount.sum tran_amount.mean tran_amount.median
## 1 CS1112 1012 67.46667 63.0
## 2 CS1113 1490 74.50000 79.5
## 3 CS1114 1432 75.36842 79.0
## 4 CS1115 1659 75.40909 72.5
## 5 CS1116 857 65.92308 60.0
## 6 CS1117 1185 69.70588 67.0
## tran_amount.length
## 1 15
## 2 20
## 3 19
## 4 22
## 5 13
## 6 17
8.컬럼 이름 변경: 컬럼 이름 축약해서 보기 편하게 하기 위해
Cust_Sum<-rename(Cust_Sum, sum = tran_amount.sum, mean = tran_amount.mean,
median= tran_amount.median, length=tran_amount.length)
9.1 데이터 분포 보기-sum
hist(Cust_Sum$sum)

9.2 데이터분포 보기-mean
hist(Cust_Sum$mean)

9.4 데이터 분포보기- length
hist(Cust_Sum$length)

10. 구매 데이터를 사용해 고객의 구매 주기를 계산
10.1. 구매데이터 고객별, 날짜별로 정렬
Retail_Data_T<-Retail_Data_T[order(Retail_Data_T$customer_id, Retail_Data_T$trans_date),]
head(Retail_Data_T)
## customer_id trans_date tran_amount
## 77248 CS1112 2011-06-15 56
## 89150 CS1112 2011-08-19 96
## 68207 CS1112 2011-10-02 60
## 36487 CS1112 2012-04-08 56
## 93075 CS1112 2012-06-24 52
## 67736 CS1112 2012-07-03 81
10.2. 각 고객의 구매일간 주기를 계산(첫번째 구매일과 두 번째 구매일간의 차이)
Retail_Data_T$trans_date_lag<-shift(Retail_Data_T$trans_date, n=1, fill=NA, type = "lag")
Retail_Data_T$trans_date_diff<-Retail_Data_T$trans_date - Retail_Data_T$trans_date_lag
Retail_Data_T$trans_date_final<-ifelse(Retail_Data_T$customer_id == shift(Retail_Data_T$customer_id, n=1, fill = NA, type = "lag"),
Retail_Data_T$trans_date-shift(Retail_Data_T$trans_date, n=1, fill=NA, type = "lag"),
NA)
head(Retail_Data_T)
## customer_id trans_date tran_amount trans_date_lag trans_date_diff
## 77248 CS1112 2011-06-15 56 <NA> NA days
## 89150 CS1112 2011-08-19 96 2011-06-15 65 days
## 68207 CS1112 2011-10-02 60 2011-08-19 44 days
## 36487 CS1112 2012-04-08 56 2011-10-02 189 days
## 93075 CS1112 2012-06-24 52 2012-04-08 77 days
## 67736 CS1112 2012-07-03 81 2012-06-24 9 days
## trans_date_final
## 77248 NA
## 89150 65
## 68207 44
## 36487 189
## 93075 77
## 67736 9
11. 고객별 구매주기 평균 계산
Cust_Pattern<-summaryBy(trans_date_final~customer_id, data = Retail_Data_T, FUN= mean, na.rm =TRUE)
head(Cust_Pattern)
## customer_id trans_date_final.mean
## 1 CS1112 93.50000
## 2 CS1113 71.26316
## 3 CS1114 72.72222
## 4 CS1115 62.04762
## 5 CS1116 96.25000
## 6 CS1117 71.18750
12.Retail_Data_R 데이터 병합
Cust_Master<-merge((merge(Cust_Sum,Cust_Pattern,all.x=T,by="customer_id")),
Retail_Data_R,all=T,by="customer_id")
13. 데이터간 상관성 파악
plot(Cust_Master[,2:length(Cust_Master)])

RFM기분에 따른 고객 분류
14. 구매주기기준 고객 분류_H,M,L
q1<-quantile(Cust_Master$trans_date_final.mean)
Cust_Master$Recency<-ifelse(Cust_Master$trans_date_final.mean<q1[2],"H",
ifelse(Cust_Master$trans_date_final.mean<q1[4],"M","L"))
head(Cust_Master)
## customer_id sum mean median length trans_date_final.mean response
## 1 CS1112 1012 67.46667 63.0 15 93.50000 0
## 2 CS1113 1490 74.50000 79.5 20 71.26316 0
## 3 CS1114 1432 75.36842 79.0 19 72.72222 1
## 4 CS1115 1659 75.40909 72.5 22 62.04762 1
## 5 CS1116 857 65.92308 60.0 13 96.25000 1
## 6 CS1117 1185 69.70588 67.0 17 71.18750 0
## Recency
## 1 L
## 2 M
## 3 M
## 4 M
## 5 L
## 6 M
15.구매횟수에 따른 고객 분류_H,M,L
q2<-quantile(Cust_Master$length)
Cust_Master$Frequency<-ifelse(Cust_Master$length<q2[2],"L",
ifelse(Cust_Master$length<q2[4],"M","H"))
head(Cust_Master)
## customer_id sum mean median length trans_date_final.mean response
## 1 CS1112 1012 67.46667 63.0 15 93.50000 0
## 2 CS1113 1490 74.50000 79.5 20 71.26316 0
## 3 CS1114 1432 75.36842 79.0 19 72.72222 1
## 4 CS1115 1659 75.40909 72.5 22 62.04762 1
## 5 CS1116 857 65.92308 60.0 13 96.25000 1
## 6 CS1117 1185 69.70588 67.0 17 71.18750 0
## Recency Frequency
## 1 L M
## 2 M M
## 3 M M
## 4 M H
## 5 L L
## 6 M M
16. 구매 금액 기준 고객 분류_H, M, L
q3<-quantile(Cust_Master$sum)
Cust_Master$Monetary<-ifelse(Cust_Master$sum<q3[2],"L",
ifelse(Cust_Master$sum<q3[4],"M","H"))
head(Cust_Master)
## customer_id sum mean median length trans_date_final.mean response
## 1 CS1112 1012 67.46667 63.0 15 93.50000 0
## 2 CS1113 1490 74.50000 79.5 20 71.26316 0
## 3 CS1114 1432 75.36842 79.0 19 72.72222 1
## 4 CS1115 1659 75.40909 72.5 22 62.04762 1
## 5 CS1116 857 65.92308 60.0 13 96.25000 1
## 6 CS1117 1185 69.70588 67.0 17 71.18750 0
## Recency Frequency Monetary
## 1 L M M
## 2 M M M
## 3 M M M
## 4 M H H
## 5 L L M
## 6 M M M
17.RFM 기준으로 각 값의 평균과 횟수 구하기
Result<-summaryBy(.~Recency+Frequency+Monetary, data=Cust_Master, FUN=c(mean,length))
head(Result)
## Recency Frequency Monetary sum.mean mean.mean median.mean length.mean
## 1 H H H 1801.3221 70.67305 70.89798 25.50299
## 2 H H M 1412.9514 62.26471 60.29514 22.74306
## 3 H L L 454.6667 49.09015 49.58333 9.50000
## 4 H L M 891.0000 68.53846 66.00000 13.00000
## 5 H M H 1572.1707 75.04506 76.95122 20.95122
## 6 H M L 742.5000 45.89179 45.91667 16.16667
## trans_date_final.mean.mean response.mean sum.length mean.length
## 1 52.90972 0.1831091 1338 1338
## 2 57.00995 0.1875000 144 144
## 3 50.44815 NA 6 6
## 4 58.25000 0.0000000 1 1
## 5 57.60931 0.2195122 41 41
## 6 56.58502 0.1666667 6 6
## median.length length.length trans_date_final.mean.length response.length
## 1 1338 1338 1338 1338
## 2 144 144 144 144
## 3 6 6 6 6
## 4 1 1 1 1
## 5 41 41 41 41
## 6 6 6 6 6
18.분석에 필요한 컬럼만 선택하여 컬럼명과 순서 변경: 결과 인식을 용이하게 하기 위해
names(Result)
## [1] "Recency" "Frequency"
## [3] "Monetary" "sum.mean"
## [5] "mean.mean" "median.mean"
## [7] "length.mean" "trans_date_final.mean.mean"
## [9] "response.mean" "sum.length"
## [11] "mean.length" "median.length"
## [13] "length.length" "trans_date_final.mean.length"
## [15] "response.length"
Result_F<-select(Result,-(contains("median")), -(starts_with("response")),-(starts_with("mean")))
Result_F<-rename(Result_F, Monetary.mean=sum.mean, Frequency.mean=length.mean,
Recency.mean=trans_date_final.mean.mean, Monetary.length=sum.length,
Frequency.length=length.length, Recency.length=trans_date_final.mean.length)
Result_F<-Result_F[,c(1,2,3,6,5,4,9,8,7)]
head(Result_F)
## Recency Frequency Monetary Recency.mean Frequency.mean Monetary.mean
## 1 H H H 52.90972 25.50299 1801.3221
## 2 H H M 57.00995 22.74306 1412.9514
## 3 H L L 50.44815 9.50000 454.6667
## 4 H L M 58.25000 13.00000 891.0000
## 5 H M H 57.60931 20.95122 1572.1707
## 6 H M L 56.58502 16.16667 742.5000
## Recency.length Frequency.length Monetary.length
## 1 1338 1338 1338
## 2 144 144 144
## 3 6 6 6
## 4 1 1 1
## 5 41 41 41
## 6 6 6 6
19. 엑셀 파일로 추출하여 값 확인하기
write.xlsx(Result_F, "RFM_Result.xlsx")