Projek Status : on-progress update description
Email : gabrielerichsonmrp@gmail.com
Linkedin : www.linkedin.com/in/gabrielerichson
Github : www.github.com/gabrielerichsonmrp
# Library
# Wrangling
library(tidyverse) #untuk wrangling
#visualization
library(plotly) # visualisasi chart
library(paletti) # warna
library(GGally) # Korelasi
library(glue) # pop up text pada chart
library(gridExtra) # Display 2 chart dalam 1 baris
#recomendation
library(recommenderlab) # Salah satu library yang membuat model recommendation# Kustomisasi Warna dan Visualisasi chart
my_color = c(
col1="#d3f2a3",
col2="#97e196",
col3="#6cc08b",
col4="#4c9b82",
col5="#217a79",
col6="#105965",
col7="#074050"
)
my_theme_fill <- get_scale_fill(get_pal(my_color))
my_theme_color <- get_scale_color(get_pal(my_color))
my_theme_hex <- get_hex(my_color)
color_dark_text = "#222629"
# MY PLOT THEME
my_plot_theme <- function (base_size, base_family="Segoe UI Semibold"){
dark_color="#222629"
facet_header = "#78767647"
dark_text = "#222629"
half_line <- base_size/2
theme_algoritma <- theme(
plot.background = element_rect(fill=NA,colour = NA), #background plot
plot.title = element_text(size = rel(1.2), margin = margin(b = half_line * 1.2),
color= dark_text, hjust = 0, family=base_family, face = "bold"),
plot.subtitle = element_text(size = rel(1.0), margin = margin(b = half_line * 1.2), color= dark_text, hjust=0),
plot.margin=unit(c(0.5,0.5,0.5,0.5),"cm"),
#plot.margin=unit(c(0.5,r=5,1,0.5),"cm"),
panel.background = element_rect(fill="#18181800",colour = "#e8e8e8"), #background chart
panel.border = element_rect(fill=NA,color = NA),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(color="#e8e8e8", linetype=2),
panel.grid.minor.y = element_blank(),
#panel.margin = unit(0.8*half_line, "mm"),
panel.margin.x = NULL,
panel.margin.y = NULL,
panel.ontop = FALSE,
panel.spacing = unit(1.2,"lines"),
legend.background = element_rect(fill="#18181800",colour = NA),
legend.text = element_text(size = rel(0.7),color=dark_text),
legend.title = element_text(colour = dark_text, size = base_size, lineheight = 0.8),
legend.box = NULL,
# text = element_text(colour = "white", size = base_size, lineheight = 0.9,
# angle = 0, margin = margin(), debug = FALSE),
axis.text = element_text(size = rel(0.8), color=dark_text),
axis.text.x = element_text(colour = dark_text, size = base_size, margin = margin(t = 0.8 * half_line/2)),
axis.text.y = element_text(colour = dark_text, size = base_size, margin = margin(r = 0.8 * half_line/2)),
axis.title.x = element_text(colour = dark_text, size = base_size, lineheight = 0.8,
margin = margin(t = 0.8 * half_line, b = 0.8 * half_line/2)),
axis.title.y = element_text(colour = dark_text, size = base_size, lineheight = 0.8,
angle = 90, margin = margin(r = 0.8 * half_line, l = 0.8 * half_line/2)),
axis.ticks = element_blank(),
strip.background = element_rect(fill=facet_header,colour = NA),
strip.text = element_text(colour = dark_text, size = rel(0.8)),
strip.text.x = element_text(margin = margin(t = half_line*0.8, b = half_line*0.8)),
strip.text.y = element_text(angle = -90, margin = margin(l = half_line, r = half_line)),
strip.switch.pad.grid = unit(0.1, "cm"),
strip.switch.pad.wrap = unit(0.1, "cm"),
complete = TRUE
)
}Pada Part 2 ini, kita akan melakukan analisis dan segmentasi customer berdasarkan RFM Value. Project ini terbagi menjadi 3 part yaitu:
Saya sangat menyarankan teman-teman membaca setiap part secara berurutan karena setiap part berhubungan.
Pada part 3 ini fokus kita untuk membentuk model untuk personalized product recommendatoin. Berikut 10 data teratas:
df_product_personalized <- readRDS("data_clean/df_product_personalized.rds")
head(df_product_personalized,10)Untuk membuat model ini kita akan menguji beberapa algoritma yaitu Association Rules, Item-based CF dan User-based CF menggunakan pendekatan pearson correlation, consine similarity dan jaccard distance.
Data yang kita miliki merupakan data histori transaksi, sehingga kita bisa menggunakan binaryRatingsMatrix dengan ketentuan Jika produk dibeli maka di set 1 dan jika tidak maka di set 0. Matriks jenis ini biasa disebut juga Sparse Matrix. Kita akan mencoba menggunakan matriks atas data Customer-Produk dan Invoice-Produk. Syarat yang harus dipenuhi dari sistem rekomendasi yaitu customer harus pernah melakukan transaksi atau produk pernah dibeli.
Matriks customer-product terdiri dari data histori transaksi setiap produk yang dibeli oleh masing-masing customer. Matriks ini bisa digunakan untuk memberikan rekomendasi yang bersifat bebas atau kapanpun. Misalkan pada aplikasi m-commerce, ketika user sudah login dan masuk kehalaman pertama maka pengguna akan mendapat rekomendasi langung.
df_product_personalized <- readRDS("data_clean/df_product_personalized.rds")
# drop NA Values of customer_id
df_custprod <- df_product_personalized %>% select(customer_id,stock_code)
df_custprod <- drop_na(df_custprod)
sapply(df_custprod[ ,c('customer_id','stock_code')], function(x) length(unique(x)))#> customer_id stock_code
#> 4293 3624
# Remove duplicate product per customer
df_custprod <- df_custprod %>% mutate(identifier = paste0(customer_id,"-",stock_code))
df_custprod <- df_custprod[!duplicated(df_custprod$identifier), ] %>% select(-identifier)rating_matrix_custprod <- df_custprod %>%
select(customer_id, stock_code) %>%
mutate(value = 1) %>%
spread(stock_code, value, fill = 0) %>%
select(-customer_id) %>%
as.matrix() %>%
as("binaryRatingMatrix")
rating_matrix_custprod#> 4293 x 3624 rating matrix of class 'binaryRatingMatrix' with 257561 ratings.
Matriks invoice-product sering digunakan untuk melakukan market based analysis. Matriks ini tidak memperdulikan siapa penggunanya, dia hanya peduli produk apa yang dibeli dalam suatu transaksi. Misalkan pada mcommerce, ketika pengguna memasukan produk ke keranjang belanja, maka sistem akan memberikan produk rekomedasi atas produk yang memiliki kemiripan dengan yang dikeranjang belanja pengguna. Untuk lebih detail, silahkan lihat ini Cross Selling and Market Basket Analysis
# Remove duplicate product per product
df_invprod <- df_product_personalized %>% select(invoice_no,stock_code)
df_invprod <- df_invprod %>% mutate(identifier = paste0(invoice_no,"-",stock_code))
df_invprod <- df_invprod[!duplicated(df_invprod$identifier), ] %>% select(-identifier)
sapply(df_invprod[ ,c('invoice_no','stock_code')], function(x) length(unique(x)))#> invoice_no stock_code
#> 18957 3768
rating_matrix_invprod <- df_invprod %>%
select(invoice_no, stock_code) %>%
mutate(value = 1) %>%
spread(stock_code, value, fill = 0) %>%
select(-invoice_no) %>%
as.matrix() %>%
as("binaryRatingMatrix")
rating_matrix_invprod#> 18957 x 3768 rating matrix of class 'binaryRatingMatrix' with 492627 ratings.
Sama dengan matriks invoice-produk sebelumnya, namun yang membedakan pada matriks ini seluruh data invoice yang memiliki missing value customer_id diexclude.
# Remove duplicate product per product
df_invprod_drop <- drop_na(df_product_personalized)
df_invprod_drop <- df_invprod_drop %>% select(invoice_no,stock_code)
df_invprod_drop <- df_invprod_drop %>% mutate(identifier = paste0(invoice_no,"-",stock_code))
df_invprod_drop <- df_invprod_drop[!duplicated(df_invprod_drop$identifier), ] %>% select(-identifier)
sapply(df_invprod_drop[ ,c('invoice_no','stock_code')], function(x) length(unique(x)))#> invoice_no stock_code
#> 17626 3624
rating_matrix_invprod_drop <- df_invprod_drop %>%
select(invoice_no, stock_code) %>%
mutate(value = 1) %>%
spread(stock_code, value, fill = 0) %>%
select(-invoice_no) %>%
as.matrix() %>%
as("binaryRatingMatrix")
rating_matrix_invprod_drop#> 17626 x 3624 rating matrix of class 'binaryRatingMatrix' with 369539 ratings.
Proses modeling menggunakan metode K-Fold Cross Validation menggunakan K=5 dengan proporsi train 80% dan tes 20% pada setiap fold.
# rating_matrix_custprod
scheme_custprod <- rating_matrix_custprod %>%
evaluationScheme(method = "cross-validation",k = 5, train = 0.8, given = -1)
# rating_matrix_invprod
scheme_invprod <- rating_matrix_invprod %>%
evaluationScheme(method = "cross-validation", k = 5, train = 0.8, given = -1)
#rating_matrix_invprod_drop
scheme_invprod_drop <- rating_matrix_invprod_drop %>%
evaluationScheme(method = "cross-validation", k = 5, train = 0.8, given = -1)algorithms_binary <- list(
"association_rules" = list(name = "AR", param = list(support = 0.05, confidence = 0.75)),
"popular" = list(name = "POPULAR", param = NULL),
"random" = list(name = "RANDOM", param = NULL),
"ibcf_jaccard_5" = list(name = "IBCF", param = list(k = 5)),
#"ibcf_jaccard_30" = list(name = "IBCF", param = list(k = 30)),
#"ibcf_jaccard_50" = list(name = "IBCF", param = list(k = 50)),
#"ibcf_jaccard_100" = list(name = "IBCF", param = list(k = 100)),
#"ibcf_jaccard_200" = list(name = "IBCF", param = list(k = 200)),
"ibcf_pearson_5" = list(name = "IBCF", param = list(method = "Pearson", k = 5)),
#"ibcf_pearson_30" = list(name = "IBCF", param = list(method = "Pearson", k = 30)),
#"ibcf_pearson_50" = list(name = "IBCF", param = list(method = "Pearson", k = 50)),
#"ibcf_pearson_200" = list(name = "IBCF", param = list(method = "Pearson", k = 200)),
"ibcf_cosine_100" = list(name = "IBCF", param = list(method = "Cosine", k = 100)),
#"ibcf_cosine_200" = list(name = "IBCF", param = list(method = "Cosine", k = 200)),
#"ubcf_jaccard_25" = list(name = "UBCF", param = list(nn = 25)),
#"ubcf_jaccard_50" = list(name = "UBCF", param = list(nn = 50)),
"ubcf_jaccard_100" = list(name = "UBCF", param = list(nn = 100)),
#"ubcf_jaccard_200" = list(name = "UBCF", param = list(nn = 200)),
#"ubcf_jaccard_300" = list(name = "UBCF", param = list(nn = 300)),
#"ubcf_pearson_25" = list(name = "UBCF", param = list(method = "Pearson", nn = 25)),
#"ubcf_pearson_50" = list(name = "UBCF", param = list(method = "Pearson", nn = 50)),
"ubcf_pearson_100" = list(name = "UBCF", param = list(method = "Pearson", nn = 100)),
#"ubcf_pearson_200" = list(name = "UBCF", param = list(method = "Pearson", nn = 200)),
#"ubcf_pearson_200" = list(name = "UBCF", param = list(method = "Pearson", nn = 300)),
#"ubcf_cosine_25" = list(name = "UBCF", param = list(method = "Cosine", nn = 25)),
#"ubcf_cosine_50" = list(name = "UBCF", param = list(method = "Cosine", nn = 50)),
"ubcf_cosine_100" = list(name = "UBCF", param = list(method = "Cosine", nn = 100))
#"ubcf_cosine_200" = list(name = "UBCF", param = list(method = "Cosine", nn = 200)),
#"ubcf_cosine_200" = list(name = "UBCF", param = list(method = "Cosine", nn = 300))
)
memory.limit(size=56000)
start <- Sys.time()
results_custprod <- recommenderlab::evaluate(scheme_custprod,
algorithms_binary,
type = "topNList",
n = c(1, 3, 5, 10, 15, 20))
results_invprod <- recommenderlab::evaluate(scheme_invprod,
algorithms_binary,
type = "topNList",
n = c(1, 3, 5, 10, 15, 20))
results_invprod_drop <- recommenderlab::evaluate(scheme_invprod_drop,
algorithms_binary,
type = "topNList",
n = c(1, 3, 5, 10, 15, 20))
end <- Sys.time()
cat('runtime', end - start)
wd <- as.character(getwd())
saveRDS(object=results_custprod, file=paste(paste(wd,"/modeling_recommendation/development/",sep = ""),
"results_custprod_fx.rds",sep=""))
saveRDS(object=results_invprod, file=paste(paste(wd,"/modeling_recommendation/development/",sep = ""),
"results_invprod_fx.rds",sep=""))
saveRDS(object=results_invprod_drop, file=paste(paste(wd,"/modeling_recommendation/development/",sep = ""),
"results_invprod_drop_fx.rds",sep=""))
results_custprod
results_invprod
results_invprod_dropresults_custprod <- readRDS("modeling_recommendation/development/results_custprod_fx.rds")
results_invprod <- readRDS("modeling_recommendation/development/results_invprod_fx.rds")
results_invprod_drop <- readRDS("modeling_recommendation/development/results_invprod_drop_fx.rds")
method_dis = function(n,x){
if(x==1){
sprintf(paste0(n))
}else{
sprintf(x)
}
}avg_conf_matr_personal <- function(results_custprod) {
tmp <- results_custprod %>%
getConfusionMatrix() %>%
as.list()
as.data.frame( Reduce("+",tmp) / length(tmp)) %>%
mutate(n = c(1, 3, 5, 10, 15, 20)) %>%
select('n', 'precision', 'recall', 'TPR', 'FPR')
}
results_tbl_personal <- results_custprod %>%
map(avg_conf_matr_personal) %>%
# Turning into an unnested tibble
enframe() %>%
# Unnesting to have all variables on same level
unnest()
#results_tbl_personal
results_tbl_personal %>%
ggplot(aes(FPR, TPR, colour = fct_reorder2(as.factor(name), FPR, TPR))) +
geom_line() +
geom_label(aes(label = n)) +
labs(title = "ROC curves using Matrix Customer-Product",
colour = "Model") +
theme_grey(base_size = 14)results_tbl_personal %>%
ggplot(aes(recall, precision,
colour = fct_reorder2(as.factor(name), precision, recall))) +
geom_line() +
geom_label(aes(label = n)) +
geom_text(aes(label = method_dis(name,n)),hjust=-1, size=3) +
labs(title = "Precision-Recall curves using Matrix Customer-Product",
colour = "Model") +
theme_grey(base_size = 14)avg_conf_matr_baskets <- function(results_invprod) {
tmp <- results_invprod %>%
getConfusionMatrix() %>%
as.list()
as.data.frame( Reduce("+",tmp) / length(tmp)) %>%
mutate(n = c(1, 3, 5, 10, 15, 20)) %>%
select('n', 'precision', 'recall', 'TPR', 'FPR')
}
results_tbl_baskets <- results_invprod %>%
map(avg_conf_matr_baskets) %>%
# Turning into an unnested tibble
enframe() %>%
# Unnesting to have all variables on same level
unnest()
#results_tbl_baskets
results_tbl_baskets %>%
ggplot(aes(FPR, TPR, colour = fct_reorder2(as.factor(name), FPR, TPR))) +
geom_line() +
geom_label(aes(label = n)) +
labs(title = "ROC curves using Matrix Invoice-Product",
colour = "Model") +
theme_grey(base_size = 14)results_tbl_baskets %>%
ggplot(aes(recall, precision,
colour = fct_reorder2(as.factor(name), precision, recall))) +
geom_line() +
geom_label(aes(label = n)) +
geom_text(aes(label = method_dis(name,n)),hjust=-1, size=3) +
labs(title = "Precision-Recall curves using Matrix Invoice-Product",
colour = "Model") +
theme_grey(base_size = 14)avg_conf_matr_baskets_drop <- function(results_invprod_drop) {
tmp <- results_invprod_drop %>%
getConfusionMatrix() %>%
as.list()
as.data.frame( Reduce("+",tmp) / length(tmp)) %>%
mutate(n = c(1, 3, 5, 10, 15, 20)) %>%
select('n', 'precision', 'recall', 'TPR', 'FPR')
}
results_tbl_baskets_drop <- results_invprod_drop %>%
map(avg_conf_matr_baskets_drop) %>%
# Turning into an unnested tibble
enframe() %>%
# Unnesting to have all variables on same level
unnest()
results_tbl_baskets_drop %>%
ggplot(aes(FPR, TPR, colour = fct_reorder2(as.factor(name), FPR, TPR)))+
geom_line()+
geom_label(aes(label = n))+
labs(title = "ROC curves using Matrix Invoice-Product",
colour = "Model")+
theme_grey(base_size = 14)results_tbl_baskets_drop %>%
ggplot(aes(recall, precision,
colour = fct_reorder2(as.factor(name), precision, recall))) +
geom_line() +
geom_label(aes(label = n)) +
geom_text(aes(label = method_dis(name,n)),hjust=-1, size=3) +
labs(title = "Precision-Recall curves using Matrix Invoice-Product",
colour = "Model") +
theme_grey(base_size = 14)Dari hasil evaluasi diatas, metode Item-based Collaborative Filtering menghasilkan nilai yang lebih baik pada ketiga matriks. Pada matriks invoice-product, jika hanya merokomendasikan 1 produk maka lebih baik menggunakan metode User-based CF karena bisa lihat garis cenderung jauh menurun apabila memberikan rekomendasi lebih dari 1. Oleh karena itu untuk enviroment production akan menggunakan metode rekomendasi Item-based Collaborative Filtering.
df_product_personalized <- readRDS("data_clean/df_product_personalized.rds")
df_customer_transaction <- readRDS("data_clean/df_customer_transaction.rds")
df_master_product <- df_customer_transaction %>% select(stock_code,description) %>% distinct() %>% arrange(stock_code)
glimpse(df_master_product)#> Observations: 3,624
#> Variables: 2
#> $ stock_code <chr> "10002", "10080", "10120", "10123C", "10124A", "10124G"...
#> $ description <chr> "INFLATABLE POLITICAL GLOBE", "GROOVY CACTUS INFLATABLE...
wd <- as.character(getwd())
saveRDS(object=df_master_product, file=paste(paste(wd,"/data_clean/",sep = ""),"df_master_product.rds",sep=""))
# drop NA Values of customer_id
df_custprod <- df_product_personalized %>% select(customer_id,stock_code)
df_custprod <- drop_na(df_custprod)
sapply(df_custprod[ ,c('customer_id','stock_code')], function(x) length(unique(x)))#> customer_id stock_code
#> 4293 3624
# Remove duplicate product per customer
df_custprod <- df_custprod %>% mutate(identifier = paste0(customer_id,"-",stock_code))
df_custprod <- df_custprod[!duplicated(df_custprod$identifier), ] %>% select(-identifier)
rating_matrix_custprod <- df_custprod %>%
select(customer_id, stock_code) %>%
mutate(value = 1) %>%
spread(stock_code, value, fill = 0) %>%
select(-customer_id) %>%
as.matrix() %>%
as("binaryRatingMatrix")
rating_matrix_custprod#> 4293 x 3624 rating matrix of class 'binaryRatingMatrix' with 257561 ratings.
model_ibcf_pearson_custprod <- Recommender(rating_matrix_custprod, method = "IBCF", param = list(method = "Pearson", k = 5))
wd <- getwd()
saveRDS(object=model_ibcf_pearson_custprod,
file=paste(paste(wd,"/modeling_recommendation/production/",sep = ""),"model_ibcf_pearson_custprod.rds",sep=""))model_ibcf_pearson_custprod <- readRDS("modeling_recommendation/production/model_ibcf_pearson_custprod.rds")
summary(model_ibcf_pearson_custprod@model)#> Length Class Mode
#> description 1 -none- character
#> sim 13133376 dgCMatrix S4
#> k 1 -none- numeric
#> method 1 -none- character
#> normalize_sim_matrix 1 -none- logical
#> alpha 1 -none- numeric
#> verbose 1 -none- logical
data.frame(
method = model_ibcf_pearson_custprod@model[1]$description,
correlation = model_ibcf_pearson_custprod@model[4]$method,
model_ibcf_pearson_custprod@model[3]
)Rekomendasi ini dihasilkan berdasarkan histori produk yang dibeli oleh customer. Misalkan kita simulasi untuk customer_id == 17850. Histori produk yang dibeli sebagai berikut:
model_ibcf_pearson_custprod <- readRDS("modeling_recommendation/production/model_ibcf_pearson_custprod.rds")
cust_id <- "17850"
cust_hist_product <- df_customer_transaction %>% filter(customer_id == cust_id) %>%
group_by(stock_code, description) %>%
summarise(total_quantity = sum(quantity),
total_monetary = sum(total_amount)) %>%
ungroup() %>%
arrange(desc(total_quantity,total_monetary)) %>%
mutate(scoring = row_number())
cust_hist_productKemudian siapkan matriks nya:
rating_matrix_histprod <- df_custprod %>%
select(stock_code) %>%
unique() %>%
mutate(value = as.numeric(stock_code %in% as.vector(cust_hist_product$stock_code))) %>%
spread(stock_code, value) %>%
as.matrix() %>%
as("binaryRatingMatrix")
rating_matrix_histprod#> 1 x 3624 rating matrix of class 'binaryRatingMatrix' with 21 ratings.
Berikut Top produk yang bisa direkomendasikan kepada customer_id == 17850 :
# Predict
predict_result_histprod <- predict(model_ibcf_pearson_custprod, newdata = rating_matrix_histprod, n= 20, type="topNList")
result_cust_recomm = NULL
if((length(predict_result_histprod@items$`1`))>0){
pred_prod_cust <- data.frame(
customer_id = cust_id,
recommendation_based="History Order",
stock_code = getList(predict_result_histprod)$`1`,
recom_rate = round(predict_result_histprod@ratings$`1`,2)
)
pred_prod_cust <- pred_prod_cust %>%
left_join(df_master_product, by=c("stock_code","stock_code")) %>%
select(customer_id,recommendation_based,stock_code,description,recom_rate)
result_cust_recomm <- rbind(result_cust_recomm,pred_prod_cust)
}
result_cust_recomm