Cohort Analysis 는 고객의 retention/chrun(이탈)에 대한 분석이다. 고객의 트렌드에 대한 이해를 높히며, 소비자 타켓을 구축하거나, 의사 결정에 도움을 주는 분석 기법이다.
UCI 의 Online Retail Data 를 활용하여 분석을 실시한다.
setwd('C:/Users/Administrator/Desktop/R Analysis/Fast Campus')
Online_Retail <- read_excel("Online Retail.xlsx")
View(Online_Retail)
str(Online_Retail)## tibble [541,909 x 8] (S3: tbl_df/tbl/data.frame)
## $ InvoiceNo : chr [1:541909] "536365" "536365" "536365" "536365" ...
## $ StockCode : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: POSIXct[1:541909], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ UnitPrice : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
## $ Country : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
중복되는 값들을 제거한다 - unique / duplicate 함수 사용
#--------------------------------------------------------------------
# 총 541909 개의 데이터 중, 중복이 없는 데이터는 536641 개이다.
#---------------------------------------------------------------------
nrow(unique(Online_Retail)) #536641## [1] 536641
## [1] 541909
#--------------------------------------------------------------------
# 중복 데이터의 수: 5268개
#---------------------------------------------------------------------
nrow(Online_Retail)-nrow(unique(Online_Retail))## [1] 5268
#--------------------------------------------------------------------
# Customer ID 결측치 135037개 확인
#---------------------------------------------------------------------
colSums(is.na(online.retail2))## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 1454 0 0 0
## CustomerID Country
## 135037 0
#--------------------------------------------------------------------
# as.Date 함수를 사용하여 정리 - Invoice Date
#---------------------------------------------------------------------
online.retail3$InvoiceDate <- as.Date(online.retail3$InvoiceDate, format="%m/%d/%Y")
#--------------------------------------------------------------------
# Year 함수를 사용하여 년도 추출 - Invoice Date
#---------------------------------------------------------------------
online.retail3$year <- as.numeric(format(online.retail3$InvoiceDate, '%Y'))
#--------------------------------------------------------------------
# 2011 년도 데이터만 추출
#---------------------------------------------------------------------
cohort2011 <- online.retail3[online.retail3$year == 2011,]
#--------------------------------------------------------------------
# Customer ID, Invoice Data, Year 선별을 통해 cohort 그룹 지정
#---------------------------------------------------------------------
cohort2011 %>%
select(CustomerID, InvoiceDate, year) -> cohort2011각 고객별 InvoiceDate 를 고객들이 JOIN DATE 로 지정
Cohort 2011 데이터에 JOIN DATE 와 구매 빈도를 병합 (Inner Join)
고객들의 JOIN DATE의 Month로 각 고객들을 라벨링 한다.
cohort 는 특정 시간에 비슷한 성향을 보이는 그룹을 말하는 것으로, 고객들이 실제 고객으로 되는 기준을 날짜로 정해준다. 각 마케팅 채널별로 세분화가 가능할 수 있다. 예를 들어, 온라인데이터에서 제공되는 날짜변수를 통해 월별/주별/분기별/년별 등으로 고객들을 세분화 시키는 분석법을 말한다.
#----------------------------------------------------------------------
# Cohort2011 데이터에서 각 ID 별 구매 횟수 데이터 프레임 생성
#--------------------------------------------------------------------
cohort2011 %>%
group_by(CustomerID) %>%
count() -> order.frequency
#----------------------------------------------------------------------
# Cohort2011 데이터에서 각 ID 별 최초 구매일 = JOIN DATE/MIN 함수 적용
#--------------------------------------------------------------------
cohort2011 %>%
group_by(CustomerID) %>%
summarise(InvoiceDate = min(InvoiceDate)) -> Join.date
#Renaming columns
colnames(Join.date)[2] <- "Join_Date"
#----------------------------------------------------------------------
# Cohort2011과 Join_date 병합
#--------------------------------------------------------------------
merge(cohort2011, Join.date,
by.x = "CustomerID",
by.y = "CustomerID",
all.x = TRUE) -> cohorts2011
#----------------------------------------------------------------------
# Month 를 추출하여, Cohort 변수 생성 Ex) JOIN_DATE가 1월이면 라벨링 1
#--------------------------------------------------------------------
as.numeric(format(cohorts2011$Join_Date, "%m")) -> cohorts2011$Cohort
DT::datatable(head(cohorts2011))가입 날짜 (Join Date) 와 구매 날짜 (Invoice Date) 의 차이를 구해서, Active customer에 대한 cohort 설정
#----------------------------------------------------------------------
# Invoice - Join = Days difftime 함수 사용
#--------------------------------------------------------------------
as.numeric(difftime(cohorts2011$InvoiceDate,
cohorts2011$Join_Date,
units = c("days"))) -> cohorts2011$days #Unit 지정: 일 별 계산
#----------------------------------------------------------------------
# days/30 = month & Invoice Date 일자 삭제
#--------------------------------------------------------------------
cohorts2011 %>%
mutate(month = round(days/30)) %>%
mutate(InvoiceDate = format(InvoiceDate, "%Y-%m")) %>%
mutate(Join_Date = format(Join_Date, "%Y-%m")) -> df_co
#----------------------------------------------------------------------
# Cohort 생성하기
#--------------------------------------------------------------------
c ("Jan Cohort",
"Feb Cohort",
"Mar Cohort",
"Apr Cohort",
"May Cohort",
"Jun Cohort",
"Jul Cohort",
"Aug Cohort",
"Sep Cohort",
"Oct Cohort",
"Nov Cohort",
"Dec Cohort") -> groups
for (i in 1:12) {
df_co[df_co$Cohort == i, "Cohort"] <- groups[i]
}
#----------------------------------------------------------------------
# Cohort factor level 고정값 지정
#--------------------------------------------------------------------
df_co$Cohort <- factor(df_co$Cohort,ordered = T,levels =c("Jan Cohort",
"Feb Cohort",
"Mar Cohort",
"Apr Cohort",
"May Cohort",
"Jun Cohort",
"Jul Cohort",
"Aug Cohort",
"Sep Cohort",
"Oct Cohort",
"Nov Cohort",
"Dec Cohort"))
#----------------------------------------------------------------------
# Cohort factor 변환
#--------------------------------------------------------------------
#str(cohorts2011$Cohort)
which(duplicated(df_co[-5:-6])) -> dup
df_1 <- df_co[-dup,]
#----------------------------------------------------------------------
# 프레임 변환하기
#--------------------------------------------------------------------
dcast(df_1, Cohort ~ month,
value.var = "CustomerID",
fun.aggregate = length) ->cohorts.wide
cw.retention <- cohorts.wide
cw.churn <-cohorts.wide
breaks <- quantile(cohorts.wide[,3:13], prob= seq(.05, .95, .05), na.rm = T)
colors <- sapply(round(seq(155,80, length.out = length(breaks)+1),0),
function(x){rgb(x,x,155, maxColorValue = 155)})
datatable(cohorts.wide,
class = 'cell-border stripe',
rownames = FALSE,
options = list(
ordering=F,
dom = 't',
pageLength = 12) ) %>%
formatStyle("0",
backgroundColor = 'lightgrey',
fontWeight = 'bold') %>%
formatStyle(names(cohorts.wide[c(-1,-2)]),fontWeight = 'bold',color = 'white',
backgroundColor = styleInterval(breaks,colors))## # A tibble: 401,604 x 11
## # Groups: CustomerID [4,372]
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID
## <chr> <chr> <chr> <dbl> <date> <dbl> <dbl>
## 1 536365 85123A WHITE HANG~ 6 2010-12-01 2.55 17850
## 2 536365 71053 WHITE META~ 6 2010-12-01 3.39 17850
## 3 536365 84406B CREAM CUPI~ 8 2010-12-01 2.75 17850
## 4 536365 84029G KNITTED UN~ 6 2010-12-01 3.39 17850
## 5 536365 84029E RED WOOLLY~ 6 2010-12-01 3.39 17850
## 6 536365 22752 SET 7 BABU~ 2 2010-12-01 7.65 17850
## 7 536365 21730 GLASS STAR~ 6 2010-12-01 4.25 17850
## 8 536366 22633 HAND WARME~ 6 2010-12-01 1.85 17850
## 9 536366 22632 HAND WARME~ 6 2010-12-01 1.85 17850
## 10 536367 84879 ASSORTED C~ 32 2010-12-01 1.69 13047
## # ... with 401,594 more rows, and 4 more variables: Country <chr>, year <dbl>,
## # Month <date>, unit <chr>
Time Cohorts : 제품과 서비스를 사용하기 시작한 시간 기준 (Month/ Quarter 기준)
Behavior Cohorts : Product 타입 기준 -> Design custome-made Service or product 전략 수립
Size Cohort : 제품과 서비스를 구매한 고객의 사이즈 기준: 특별 기간 동안 쓴 소비액
Online_Retail -> retail
#------------------------------------------------------------------------
# NA 제거
#------------------------------------------------------------------------
retail %>%
drop_na(CustomerID) -> retail
#------------------------------------------------------------------------
# 중복제거
#------------------------------------------------------------------------
retail[!duplicated(retail),] ->retail
summary(retail)## InvoiceNo StockCode Description Quantity
## Length:401604 Length:401604 Length:401604 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 5.00
## Mean : 12.18
## 3rd Qu.: 12.00
## Max. : 80995.00
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. : 0.00 Min. :12346
## 1st Qu.:2011-04-06 15:02:00 1st Qu.: 1.25 1st Qu.:13939
## Median :2011-07-29 15:40:00 Median : 1.95 Median :15145
## Mean :2011-07-10 12:08:23 Mean : 3.47 Mean :15281
## 3rd Qu.:2011-10-20 11:58:30 3rd Qu.: 3.75 3rd Qu.:16784
## Max. :2011-12-09 12:50:00 Max. :38970.00 Max. :18287
## Country
## Length:401604
## Class :character
## Mode :character
##
##
##
#------------------------------------------------------------------------
# 0 이상의 데이터만 추출 : MIN 값이 0 인경우 모델링에 negative affect, 제거
#------------------------------------------------------------------------
retail %>%
filter(Quantity >0 & UnitPrice > 0) -> retailInvoice Period : year/month 추출
Cohort Group : year/month 기반 고객 첫 구매일 추출
Cohort period/Cohort Index : 구매일 이후의 month 의 수
library(lubridate)
#------------------------------------------------------------------------
# ID 별 첫 구매 추출 = retail_cohort
#------------------------------------------------------------------------
retail %>%
group_by(CustomerID) %>%
summarise(Month = floor_date(min(InvoiceDate), unit = "month")) -> retail_cohort
#------------------------------------------------------------------------
# InvoiceDate 에서 Month 변수 생성 = retail
#------------------------------------------------------------------------
retail %>%
mutate(InvoiceMonth = floor_date(InvoiceDate, unit = "month")) -> retail
#------------------------------------------------------------------------
# retail_cohort + retail 병합= 각 ID 별 최초 구매 일자를 더해준다.
#------------------------------------------------------------------------
merge( retail, retail_cohort,
by.x= "CustomerID",
by.y = "CustomerID") -> retail_merge
retail_merge %>%
mutate(InvoiceMonth_num = month(InvoiceDate), #구입 한 날의 월/년도
InvoiceYear_num = year(InvoiceDate),
CohorMonth_num = month(Month), #구입 최초 일의 월/년
CohortYear_num = year(Month),
Index = (InvoiceYear_num-CohortYear_num)*12 +(InvoiceMonth_num - CohorMonth_num)+1) -> retail_merge
#------------------------------------------------------------------------
# 최초 구매일, Index 기준 customer의 수 누적 표
#------------------------------------------------------------------------
retail_merge %>%
group_by(Month, Index) %>% #최초구매일, Index
summarise(Total_Customer = n_distinct(CustomerID)) %>%
pivot_wider(names_from = Index, values_from = Total_Customer) %>%
rename(DEC = 2,
JAN = 3,
FEB = 4,
MAR = 5,
APR = 6,
MAY = 7,
JUN = 8,
JUL = 9,
AUG = 10,
SEP = 11,
OCT = 12,
NOV = 13,
DEC_A = 14) -> cohort_counts
#------------------------------------------------------------------------
# 최초 기준 비율 계산 표
#------------------------------------------------------------------------
cohort_counts %>%
mutate(TOTAL = round(DEC/DEC,3)*100,
JAN_R = round(JAN/DEC,3) *100,
FEB_R = round(FEB/DEC,3) *100,
MAR_R = round(MAR/DEC,3) *100,
APR_R = round(APR/DEC,3) *100,
MAY_R = round(MAR/DEC,3) *100,
JUN_R = round(JUN/DEC,3) *100,
JUL_R = round(JUL/DEC,3) *100,
AUG_R = round(AUG/DEC,3) *100,
SEP_R = round(SEP/DEC,3) *100,
OCT_R = round(OCT/DEC,3) *100,
NOV_R = round(NOV/DEC,3) *100,
DEC_R = round(DEC_A/DEC,3) *100) %>%
select(-c(2:14)) -> retention
retention## # A tibble: 13 x 14
## # Groups: Month [13]
## Month TOTAL JAN_R FEB_R MAR_R APR_R MAY_R JUN_R JUL_R AUG_R
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-12-01 00:00:00 100 36.6 32.3 38.4 36.3 38.4 36.3 34.9 35.4
## 2 2011-01-01 00:00:00 100 22.1 26.6 23 32.1 23 24.7 24.2 30
## 3 2011-02-01 00:00:00 100 18.7 18.7 28.4 27.1 28.4 25.3 27.9 24.7
## 4 2011-03-01 00:00:00 100 15 25.2 19.9 22.3 19.9 26.8 23 27.9
## 5 2011-04-01 00:00:00 100 21.3 20.3 21 19.7 21 21.7 26 7.3
## 6 2011-05-01 00:00:00 100 19 17.3 17.3 20.8 17.3 26.4 9.5 NA
## 7 2011-06-01 00:00:00 100 17.4 15.7 26.4 23.1 26.4 9.5 NA NA
## 8 2011-07-01 00:00:00 100 18.1 20.7 22.3 27.1 22.3 NA NA NA
## 9 2011-08-01 00:00:00 100 20.7 24.9 24.3 12.4 24.3 NA NA NA
## 10 2011-09-01 00:00:00 100 23.4 30.1 11.4 NA 11.4 NA NA NA
## 11 2011-10-01 00:00:00 100 24 11.5 NA NA NA NA NA NA
## 12 2011-11-01 00:00:00 100 11.1 NA NA NA NA NA NA NA
## 13 2011-12-01 00:00:00 100 NA NA NA NA NA NA NA NA
## # ... with 4 more variables: SEP_R <dbl>, OCT_R <dbl>, NOV_R <dbl>, DEC_R <dbl>
#------------------------------------------------------------------------
# 시각화
#------------------------------------------------------------------------
retention %>%
pivot_longer(cols = c(2:14),
names_to = "Index",
values_to = "Rate") %>%
mutate(Index = row_number() %>% as.factor()) %>%
arrange(Month) %>%
ungroup() %>%
mutate(Cohort_Month = ymd(Month) %>% #ymd 함수를 써야 제대로 나옴
as.factor()) -> viz_retention
viz_retention %>%
mutate(Cohort_Month = factor(Cohort_Month, levels = rev(levels(Cohort_Month)))) %>%
ggplot(aes(Index, Cohort_Month))+
geom_tile(aes(fill=Rate))+
geom_text(aes(label=Rate),size = 2)+
scale_fill_gradientn(
colours = c("White", "#e6d0a1", "#ff0000"),
na.value = "White") +
labs(title ="Retention Rate (in %)") #----------------------------------------------------------------------
# 평균 구매에 대한 Cohort
#---------------------------------------------------------------------
retail_merge %>%
group_by(Month, Index) %>%
summarise(avg_quan = round(mean(Quantity),1)) %>%
arrange(Month) %>%
ungroup() %>%
mutate(Cohort_Month = ymd(Month) %>%
as.factor(),
Index = as.factor(Index)) %>%
ggplot(aes(x= Index, y=Cohort_Month)) +
geom_tile(aes(fill=avg_quan))+
geom_text(aes(label= avg_quan))+
scale_fill_gradientn(
colours = c("White", "#188508", "#016000"),
na.value = c("White"))+
theme_bw()R : 최근
F : 자주
M : 금액
#------------------------------------------------------------------------
# 총 금액 구하기
#------------------------------------------------------------------------
retail_merge %>%
mutate(Purchase = (Quantity * UnitPrice)) -> retail_merge
#------------------------------------------------------------------------
# 최초 구매일
#------------------------------------------------------------------------
max(retail_merge$InvoiceDate)## [1] "2011-12-09 12:50:00 UTC"
## [1] "2011-12-10 12:50:00 UTC"
## [1] "2010-12-01 08:26:00 UTC"
#------------------------------------------------------------------------
# RFM
# R : 마지막 구매 날짜 - 구매 날짜
# F : Count
# M : Quantity * Price
#------------------------------------------------------------------------
retail_merge %>%
group_by(CustomerID) %>%
summarise(Recency = as.numeric(round(snapshot_date-max(InvoiceDate),0)),
Frequency = n(),
Monetary = sum(Purchase)) -> rfm
head(rfm)## # A tibble: 6 x 4
## CustomerID Recency Frequency Monetary
## <dbl> <dbl> <int> <dbl>
## 1 12346 326 1 77184.
## 2 12347 3 182 4310
## 3 12348 76 31 1797.
## 4 12349 19 73 1758.
## 5 12350 311 17 334.
## 6 12352 37 85 2506.
#ntile() 함수는 분위수를 계산해주며, n 인자를 통해 몇 분위로 나눌지 선택할 수 있습니다. 해당 함수 역시 #오름차순으로 분위수를 나눕니다
rfm %>%
mutate(R = ntile(-Recency,4),
Fr = ntile(Frequency,4),
M = ntile(Monetary,4),
rfm_score= R+Fr+M) -> rfm
#-----------------------------------------------------------------------
# Gold/Sliver/Bronze
#---------------------------------------------------------------------------
ifelse(rfm$rfm_score > 9 , "Gold",
ifelse(rfm$rfm_score > 5 & rfm$rfm_score <=9, "Sliver", "Bronze")) ->rfm$segment
#-----------------------------------------------------------------------
# 각 segement rfm 평균
#---------------------------------------------------------------------------
rfm %>%
group_by(segment) %>%
summarise(Recency = mean(Recency),
Frequency = mean(Frequency),
Monetary = mean(Monetary)) %>%
arrange(desc(Recency)) ->rfm.seg#-------------------------------------------------
# Step 1) Scaling
#--------------------------------------------------
scale(rfm[,2:4]) -> rfm_scale
dist(rfm_scale, method ="euclidean") %>%
as.matrix() -> rfm.dist
#-------------------------------------------------
# Step 2) K 갯수 = 3개로 설정
#--------------------------------------------------
fviz_nbclust(rfm_scale,
kmeans,
method = "wss",
k.max=9)#-------------------------------------------------
# Step 3) Kmean clustering
#--------------------------------------------------
kmeans(rfm_scale,
centers = 3,
iter.max = 1000) -> rfm.kmeans
#-------------------------------------------------
# Step 4) 군집별 평균 값 확인
#--------------------------------------------------
rfm.kmeans$centers## Recency Frequency Monetary
## 1 -0.8804868 10.84400785 13.79293005
## 2 -0.5117712 0.04986022 -0.00282326
## 3 1.5463756 -0.28003576 -0.15740003
#-------------------------------------------------
# Step 5) cluster 할당
#--------------------------------------------------
as.factor(rfm.kmeans$cluster) -> rfm$cluster_kmean
#-------------------------------------------------
# Step 6) the characteristics of each cluster
#--------------------------------------------------
rfm %>%
group_by(cluster_kmean) %>%
summarise(R = mean(Recency),
F = mean(Frequency),
M = mean(Monetary)) %>%
arrange(desc(R)) -> rfm_kmeansset.seed(2021)
#----------------------------------------------------
# K-medoid 에 부합하는 K 갯수 구하기
#---------------------------------------------------
fviz_nbclust(rfm_scale,
pam,
method = "wss",
k.max=9)#----------------------------------------------------
# K-medoid 실행 pam 함수
#---------------------------------------------------
pam(rfm_scale, k=3) ->rfm.kmedoid
plot(rfm.kmedoid)barplot(t(rfm.kmedoid$medoids),beside = T, col=2:4)
legend("topright", colnames(rfm_scale), fill=2:14, cex=0.8)#------------------------------------------------------
# Clustering 할당
#-----------------------------------------------------
as.factor(rfm.kmedoid$clustering) -> rfm$medoid_cluster
#------------------------------------------------------
# Characteristic for each K-medoids clusters
#-----------------------------------------------------
rfm %>%
group_by(medoid_cluster) %>%
summarise(R = round(mean(Recency),2),
F = round(mean(Frequency),2),
M = round(mean(Monetary),2)) %>%
arrange(desc(R)) -> rfm_med
DT::datatable(rfm_med)Cluster 1 : Recency 가 180 이상으로, 6개월 동안 구매 내역이 없는 그룹이다. 반면 빈도는 25, 평균 구매액은 GBP594 Cluster 2 : Recnecy 가 98 으로 3개월 동안 구매 내역이 없는 그룹이다. 빈도수는 41으로, 평균 861 을 구매했다. Cluster 3 : Recency 가 22 으로 마지막 구매일로 부터, 30일 이내 구매ㅎ를 한 그룹으로 142 빈도, 평균 3250 GBP 사용