Project Status: in-progress
Last Update: 10 April 2020
Olist adalah marketplace yang beroperasi sebagai perusahaan teknologi SaaS (Software as a Service) di segmen e-commerce sejak 2015. Olist menawarkan solusi bagi pemilik toko dari semua ukuran untuk meningkatkan penjualan mereka melalui platform online.
Dataset yang digunakan berisi informasi transaksi dari September 2016 sampai Oktober 2018. Dataset ini didownload dari https://www.kaggle.com/olistbr/brazilian-ecommerce
#eval=TRUE
customers <- read_csv("data_raw/olist_customers_dataset.csv")
geolocation <- read_csv("data_raw/olist_geolocation_dataset.csv")
sellers <- read_csv("data_raw/olist_sellers_dataset.csv")
products <- read_csv("data_raw/olist_products_dataset.csv")
product_category <- read_csv("data_raw/product_category_name_translation.csv")
orders <- read_csv("data_raw/olist_orders_dataset.csv")
order_items <- read_csv("data_raw/olist_order_items_dataset.csv")
order_payments <- read_csv("data_raw/olist_order_payments_dataset.csv")
order_reviews <- read_csv("data_raw/olist_order_reviews_dataset.csv")
## **brazil_abbr-external source:** https://brazil-help.com/brazilian_states.htm
brazil_abbr <- readxl::read_xlsx("data_raw/brazil_abbr.xlsx")
wd <- as.character(getwd())
#customers
saveRDS(object=customers, file=paste(paste(wd,"/data_input/",sep = ""),"customers.rds",sep=""))
#sellers
saveRDS(object=sellers, file=paste(paste(wd,"/data_input/",sep = ""),"sellers.rds",sep=""))
#brazil_abbr
saveRDS(object=brazil_abbr, file=paste(paste(wd,"/data_input/",sep = ""),"brazil_abbr.rds",sep=""))
#geolocation
geolocation <- geolocation %>%
group_by(geolocation_zip_code_prefix) %>%
slice(1) %>%
ungroup() %>%
arrange(geolocation_lat,geolocation_lng)
geolocation <- sqldf("select g.*, b.state_name,b.region from geolocation g
left join brazil_abbr b on g.geolocation_state=b.state_code")
saveRDS(object=geolocation, file=paste(paste(wd,"/data_input/",sep = ""),"geolocation.rds",sep=""))
#product_category
colnames(product_category) <- c("product_category_name","product_category_name_english")
saveRDS(object=product_category, file=paste(paste(wd,"/data_input/",sep = ""),"product_category.rds",sep=""))
# set unique product code cause product id is to long (subjective)
generateProductCode <- function(index){
sapply(index, function(x) if(x > 9999) paste("PR",x,sep = "")
else if (x > 999) paste("PR0",x,sep = "")
else if (x > 99) paste("PR00",x,sep = "")
else if (x > 9) paste("PR000",x,sep = "")
else paste("PR0000",x,sep = "") )
}
#products
products <- left_join(products,product_category,by=c("product_category_name","product_category_name")) %>%
mutate(
product_category_name = ifelse(is.na(product_category_name_english),
as.character(product_category_name),as.character(product_category_name_english))
) %>%
select(-product_category_name_english) %>%
mutate(
product_category_name = ifelse(product_category_name=="",NA,product_category_name),
product_code = generateProductCode(row_number())
) %>%
select(product_id,product_code,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,
product_weight_g, product_length_cm, product_height_cm, product_width_cm)
saveRDS(object=products, file=paste(paste(wd,"/data_input/",sep = ""),"products.rds",sep=""))
#orders
saveRDS(object=orders, file=paste(paste(wd,"/data_input/",sep = ""),"orders.rds",sep=""))
#order_items
saveRDS(object=order_items, file=paste(paste(wd,"/data_input/",sep = ""),"order_items.rds",sep=""))
#order_payments
saveRDS(object=order_payments, file=paste(paste(wd,"/data_input/",sep = ""),"order_payments.rds",sep=""))
#order_reviews
saveRDS(object=order_reviews, file=paste(paste(wd,"/data_input/",sep = ""),"order_reviews.rds",sep=""))
wd <- as.character(getwd())
customers <- readRDS("data_input/customers.rds")
geolocation <- readRDS("data_input/geolocation.rds")
sellers <- readRDS("data_input/sellers.rds")
product_category <- readRDS("data_input/product_category.rds")
products <- readRDS("data_input/products.rds")
orders <- readRDS("data_input/orders.rds")
order_items <- readRDS("data_input/order_items.rds")
order_payments <- readRDS("data_input/order_payments.rds")
order_reviews <- readRDS("data_input/order_reviews.rds")
brazil_abbr <- readRDS("data_input/brazil_abbr.rds")
Data Structure
#> Observations: 99,441
#> Variables: 5
#> $ customer_id <chr> "06b8999e2fba1a1fbc88172c00ba8bc7", "18955...
#> $ customer_unique_id <chr> "861eff4711a542e4b93843c6dd7febb0", "290c7...
#> $ customer_zip_code_prefix <chr> "14409", "09790", "01151", "08775", "13056...
#> $ customer_city <chr> "franca", "sao bernardo do campo", "sao pa...
#> $ customer_state <chr> "SP", "SP", "SP", "SP", "SP", "SC", "SP", ...
Missing Value
#> customer_id customer_unique_id customer_zip_code_prefix
#> 0 0 0
#> customer_city customer_state
#> 0 0
Duplicate Value
data.frame(
data = length(customers$customer_id),
data_unique=length(unique(customers$customer_id))
)
Data Structure
#> Observations: 19,015
#> Variables: 7
#> $ geolocation_zip_code_prefix <chr> "98780", "28155", "96255", "96100", "96...
#> $ geolocation_lat <dbl> -36.60537, -34.58642, -33.69142, -33.52...
#> $ geolocation_lng <dbl> -64.28395, -58.73210, -53.48016, -53.36...
#> $ geolocation_city <chr> "santa rosa", "santa maria", "chui", "s...
#> $ geolocation_state <chr> "RS", "RJ", "RS", "RS", "RS", "RS", "RS...
#> $ state_name <chr> "Rio Grande do Sul", "Rio de Janeiro", ...
#> $ region <chr> "South", "Southeast", "South", "South",...
geolocation <- geolocation %>%
mutate(
geolocation_state = as.factor(geolocation_state),
state_name = as.factor(state_name),
region = as.factor(region)
)
Missing Value
#> geolocation_zip_code_prefix geolocation_lat
#> 0 0
#> geolocation_lng geolocation_city
#> 0 0
#> geolocation_state state_name
#> 0 0
#> region
#> 0
Duplicate Value
data.frame(
data = length(geolocation$geolocation_zip_code_prefix),
data_unique=length(unique(geolocation$geolocation_zip_code_prefix))
)
Data Structure
#> Observations: 3,095
#> Variables: 4
#> $ seller_id <chr> "3442f8959a84dea7ee197c632cb2df15", "d1b65fc...
#> $ seller_zip_code_prefix <chr> "13023", "13844", "20031", "04195", "12914",...
#> $ seller_city <chr> "campinas", "mogi guacu", "rio de janeiro", ...
#> $ seller_state <chr> "SP", "SP", "RJ", "SP", "SP", "RJ", "PE", "S...
Missing Value
#> seller_id seller_zip_code_prefix seller_city
#> 0 0 0
#> seller_state
#> 0
Duplicate Value
Data Structure
#> Observations: 32,951
#> Variables: 10
#> $ product_id <chr> "1e9e8ef04dbcff4541ed26657ea517e5", "3aa...
#> $ product_code <chr> "PR00001", "PR00002", "PR00003", "PR0000...
#> $ product_category_name <chr> "perfumery", "art", "sports_leisure", "b...
#> $ product_name_lenght <dbl> 40, 44, 46, 27, 37, 60, 56, 56, 57, 36, ...
#> $ product_description_lenght <dbl> 287, 276, 250, 261, 402, 745, 1272, 184,...
#> $ product_photos_qty <dbl> 1, 1, 1, 1, 4, 1, 4, 2, 1, 1, 1, 4, 3, 2...
#> $ product_weight_g <dbl> 225, 1000, 154, 371, 625, 200, 18350, 90...
#> $ product_length_cm <dbl> 16, 30, 18, 26, 20, 38, 70, 40, 27, 17, ...
#> $ product_height_cm <dbl> 10, 18, 9, 4, 17, 5, 24, 8, 13, 10, 10, ...
#> $ product_width_cm <dbl> 14, 20, 15, 26, 13, 11, 44, 40, 17, 12, ...
Missing Value
#> product_id product_code
#> 0 0
#> product_category_name product_name_lenght
#> 610 610
#> product_description_lenght product_photos_qty
#> 610 610
#> product_weight_g product_length_cm
#> 2 2
#> product_height_cm product_width_cm
#> 2 2
cukup banyak data produk yang kosong, namun kita biarkan saja dulu karena variabel tersebut kurang berpengaruh pada projek ini.
Duplicate Value
Data Structure
#> Observations: 99,441
#> Variables: 8
#> $ order_id <chr> "e481f51cbdc54678b7cc49136f2d6af7", "...
#> $ customer_id <chr> "9ef432eb6251297304e76186b10a928d", "...
#> $ order_status <chr> "delivered", "delivered", "delivered"...
#> $ order_purchase_timestamp <dttm> 2017-10-02 10:56:33, 2018-07-24 20:4...
#> $ order_approved_at <dttm> 2017-10-02 11:07:15, 2018-07-26 03:2...
#> $ order_delivered_carrier_date <dttm> 2017-10-04 19:55:00, 2018-07-26 14:3...
#> $ order_delivered_customer_date <dttm> 2017-10-10 21:25:13, 2018-08-07 15:2...
#> $ order_estimated_delivery_date <dttm> 2017-10-18, 2018-08-13, 2018-09-04, ...
Ubah tipe data order_status menjadi factor:
Missing Value
#> order_id customer_id
#> 0 0
#> order_status order_purchase_timestamp
#> 0 0
#> order_approved_at order_delivered_carrier_date
#> 160 1783
#> order_delivered_customer_date order_estimated_delivery_date
#> 2965 0
Duplicate Value
Data Structure
#> Observations: 112,650
#> Variables: 7
#> $ order_id <chr> "00010242fe8c5a6d1ba2dd792cb16214", "00018f77f2...
#> $ order_item_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,...
#> $ product_id <chr> "4244733e06e7ecb4970a6e2683c13e61", "e5f2d52b80...
#> $ seller_id <chr> "48436dade18ac8b2bce089ec2a041202", "dd7ddc04e1...
#> $ shipping_limit_date <dttm> 2017-09-19 09:45:35, 2017-05-03 11:05:13, 2018...
#> $ price <dbl> 58.90, 239.90, 199.00, 12.99, 199.90, 21.90, 19...
#> $ freight_value <dbl> 13.29, 19.93, 17.87, 12.79, 18.14, 12.69, 11.85...
Missing Value
#> order_id order_item_id product_id seller_id
#> 0 0 0 0
#> shipping_limit_date price freight_value
#> 0 0 0
Duplicate Value
Data Structure
#> Observations: 103,886
#> Variables: 5
#> $ order_id <chr> "b81ef226f3fe1789b1e8b2acac839d17", "a9810da82...
#> $ payment_sequential <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
#> $ payment_type <chr> "credit_card", "credit_card", "credit_card", "...
#> $ payment_installments <dbl> 8, 1, 1, 8, 2, 2, 1, 3, 6, 1, 8, 1, 1, 5, 4, 1...
#> $ payment_value <dbl> 99.33, 24.39, 65.71, 107.78, 128.45, 96.12, 81...
Merubah tipe data payment_type:
Missing Value
#> order_id payment_sequential payment_type
#> 0 0 0
#> payment_installments payment_value
#> 0 0
Duplicate Value
data.frame(
data = length(order_payments$order_id),
data_unique=length(unique(order_payments$order_id))
)
Terdapat data pembayaran yang duplikat karena terdapat order yang melakukan pembayaran secara cicilan sehingga tidak menjadi masalah, namun nantinya tetap akan disesuaikan dengan kebutuhan.
Data Structure
#> Observations: 100,000
#> Variables: 7
#> $ review_id <chr> "7bc2406110b926393aa56f80a40eba40", "80e641...
#> $ order_id <chr> "73fc7af87114b39712e6da79b0a377eb", "a54891...
#> $ review_score <dbl> 4, 5, 5, 5, 5, 1, 5, 5, 5, 4, 5, 5, 4, 4, 3...
#> $ review_comment_title <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "recome...
#> $ review_comment_message <chr> NA, NA, NA, "Recebi bem antes do prazo esti...
#> $ review_creation_date <dttm> 2018-01-18, 2018-03-10, 2018-02-17, 2017-0...
#> $ review_answer_timestamp <dttm> 2018-01-18 21:46:59, 2018-03-11 03:05:13, ...
Missing Value
#> review_id order_id review_score
#> 0 0 0
#> review_comment_title review_comment_message review_creation_date
#> 88287 58255 0
#> review_answer_timestamp
#> 0
Untuk sementara data review_comment_title dan review_comment_message yang NULL tidak masalah karena tidak masuk ke goal yang ingin dicapai.
Duplicate Value
data.frame(
data = length(order_reviews$review_id),
data_unique=length(unique(order_reviews$review_id))
)
Terdapat data review yang duplikat karena terdapat order yang memberikan review lebih dari 1 kali.
Berikut ini merupakan data transaksi customer yang didapat dari hasil merge/join antara data customer, orders, order_item, order_review, seller dan geolocation. Berikut struktur data dan pengecekan missing value:
#Customer Order
customer_order <- customers %>%
left_join(geolocation,by=c("customer_zip_code_prefix"="geolocation_zip_code_prefix")) %>%
select(-c(geolocation_city,customer_state)) %>%
rename(customer_lat=geolocation_lat, customer_lng=geolocation_lng,
customer_state_code=geolocation_state, customer_state_name=state_name,customer_region=region) %>%
left_join(orders, by=c("customer_id")) %>%
left_join(order_items, by=c("order_id")) %>%
left_join(products, by=c("product_id")) %>%
select(-c(product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,
product_length_cm,product_height_cm,product_width_cm)) %>%
left_join(sellers, by=c("seller_id")) %>%
left_join(geolocation, by=c("seller_zip_code_prefix"="geolocation_zip_code_prefix")) %>%
select(-c(geolocation_city,seller_state)) %>%
rename(seller_lat=geolocation_lat, seller_lng=geolocation_lng,
seller_state_code=geolocation_state, seller_state_name=state_name,seller_region=region)
order_review <- orders %>%
left_join(order_reviews,by=c("order_id")) %>%
group_by(order_id) %>%
summarise(
review_score = round(mean(review_score))
) %>%
ungroup()
customer_order <- customer_order %>% left_join(order_review, by=c("order_id"))
customer_order <- customer_order %>% select(
order_id, order_status, order_purchase_timestamp,order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,
customer_lat,customer_lng,customer_city,customer_state_code, customer_state_name, customer_region, order_item_id,
product_id, product_code, product_category_name, price, freight_value, review_score, seller_id, shipping_limit_date, seller_zip_code_prefix, seller_lat, seller_lng, seller_city, seller_state_code, seller_state_name, seller_region) %>%
arrange(order_purchase_timestamp,order_id,order_item_id) %>%
filter(!(!order_status %in% c("canceled","unavailable") & is.na(product_id)))
Data Structure:
#> Observations: 113,417
#> Variables: 31
#> $ order_id <chr> "2e7a8482f6fb09756ca50c10d7bfc047", "...
#> $ order_status <fct> shipped, shipped, canceled, canceled,...
#> $ order_purchase_timestamp <dttm> 2016-09-04 21:15:19, 2016-09-04 21:1...
#> $ order_approved_at <dttm> 2016-10-07 13:18:03, 2016-10-07 13:1...
#> $ order_delivered_carrier_date <dttm> 2016-10-18 13:14:51, 2016-10-18 13:1...
#> $ order_delivered_customer_date <dttm> NA, NA, NA, NA, 2016-11-09 07:47:38,...
#> $ order_estimated_delivery_date <dttm> 2016-10-20, 2016-10-20, 2016-10-28, ...
#> $ customer_unique_id <chr> "b7d76e111c89f7ebf14761390f0f7d17", "...
#> $ customer_zip_code_prefix <chr> "69309", "69309", "99025", "12244", "...
#> $ customer_lat <dbl> 2.812997, 2.812997, -28.261098, -23.2...
#> $ customer_lng <dbl> -60.69526, -60.69526, -52.40767, -45....
#> $ customer_city <chr> "boa vista", "boa vista", "passo fund...
#> $ customer_state_code <fct> RR, RR, RS, SP, SP, SP, SP, SP, SP, R...
#> $ customer_state_name <fct> Roraima, Roraima, Rio Grande do Sul, ...
#> $ customer_region <fct> North, North, South, Southeast, South...
#> $ order_item_id <dbl> 1, 2, 1, NA, 1, 2, 3, 1, 1, 1, 1, 1, ...
#> $ product_id <chr> "c1488892604e4ba5cff5b4eb4d595400", "...
#> $ product_code <chr> "PR29132", "PR12485", "PR09187", NA, ...
#> $ product_category_name <chr> "furniture_decor", "furniture_decor",...
#> $ price <dbl> 39.99, 32.90, 59.50, NA, 44.99, 44.99...
#> $ freight_value <dbl> 31.67, 31.67, 15.56, NA, 2.83, 2.83, ...
#> $ review_score <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 1, 3, 1...
#> $ seller_id <chr> "1554a68530182680ad5c8b042c3ab563", "...
#> $ shipping_limit_date <dttm> 2016-10-26 18:25:19, 2016-10-26 18:2...
#> $ seller_zip_code_prefix <chr> "37580", "37580", "81050", NA, "81810...
#> $ seller_lat <dbl> -22.42881, -22.42881, -25.49504, NA, ...
#> $ seller_lng <dbl> -46.56955, -46.56955, -49.29960, NA, ...
#> $ seller_city <chr> "monte siao", "monte siao", "curitiba...
#> $ seller_state_code <fct> MG, MG, PR, NA, PR, PR, PR, SP, PR, S...
#> $ seller_state_name <fct> Minas Gerais, Minas Gerais, Paraná, N...
#> $ seller_region <fct> Southeast, Southeast, South, NA, Sout...
Missing Value:
#> order_id order_status
#> 0 0
#> order_purchase_timestamp order_approved_at
#> 0 156
#> order_delivered_carrier_date order_delivered_customer_date
#> 1961 3221
#> order_estimated_delivery_date customer_unique_id
#> 0 0
#> customer_zip_code_prefix customer_lat
#> 0 305
#> customer_lng customer_city
#> 305 0
#> customer_state_code customer_state_name
#> 305 305
#> customer_region order_item_id
#> 305 767
#> product_id product_code
#> 767 767
#> product_category_name price
#> 2370 767
#> freight_value review_score
#> 767 0
#> seller_id shipping_limit_date
#> 767 767
#> seller_zip_code_prefix seller_lat
#> 767 1020
#> seller_lng seller_city
#> 1020 767
#> seller_state_code seller_state_name
#> 1020 1020
#> seller_region
#> 1020
Cukup banyak data geolocation yang kosong maka perlu kita sesuaikan. Data geolocation yang null akan dilakukan imputasi menggunakan rata-rata latitude dan longitude dari setiap customer dan pelanggan yang berada pada kota/state yang sama.
#Customer geolocation
na_cust_geo <- sqldf("select * from customers c
left join geolocation g on g.geolocation_zip_code_prefix = c.customer_zip_code_prefix")
na_zip <- na_cust_geo %>%
filter(is.na(geolocation_zip_code_prefix)) %>%
select(customer_zip_code_prefix,customer_state,customer_city)
na_geo <- geolocation
na_geo <- na_geo %>%
group_by(geolocation_state) %>%
summarise(
geolocation_lat = mean(geolocation_lat),
geolocation_lng =mean(geolocation_lng)
) %>%
ungroup()
na_geo <- sqldf("select * from na_geo geo inner join brazil_abbr abbr on geo.geolocation_state = abbr.state_code")
na_geo <- sqldf("select * from na_geo geo inner join na_zip zip on geo.geolocation_state = zip.customer_state") %>%
select(customer_zip_code_prefix,geolocation_lat, geolocation_lng, customer_city, customer_state, state_name, region) %>%
rename(geolocation_zip_code_prefix=customer_zip_code_prefix, geolocation_city=customer_city,
geolocation_state=customer_state)
geolocation <- rbind(geolocation,na_geo)
#Seller geolocation
na_seller_geo <- sqldf("select * from sellers s
left join geolocation g on g.geolocation_zip_code_prefix = s.seller_zip_code_prefix")
colSums(is.na(na_seller_geo))
#> seller_id seller_zip_code_prefix
#> 0 0
#> seller_city seller_state
#> 0 0
#> geolocation_zip_code_prefix geolocation_lat
#> 5 5
#> geolocation_lng geolocation_city
#> 5 5
#> geolocation_state state_name
#> 5 5
#> region
#> 5
na_seller_zip <- na_seller_geo %>%
filter(is.na(geolocation_zip_code_prefix)) %>%
select(seller_zip_code_prefix,seller_state,seller_city)
na_geo <- geolocation
na_geo <- na_geo %>%
group_by(geolocation_state) %>%
summarise(
geolocation_lat = mean(geolocation_lat),
geolocation_lng =mean(geolocation_lng)
) %>%
ungroup()
na_geo <- sqldf("select * from na_geo geo inner join brazil_abbr abbr on geo.geolocation_state = abbr.state_code")
na_geo <- sqldf("select * from na_geo geo inner join na_seller_zip zip on geo.geolocation_state = zip.seller_state") %>%
select(seller_zip_code_prefix,geolocation_lat, geolocation_lng, seller_city, seller_state, state_name, region) %>%
rename(geolocation_zip_code_prefix=seller_zip_code_prefix, geolocation_city=seller_city,
geolocation_state=seller_state)
geolocation <- rbind(geolocation,na_geo)
wd <- as.character(getwd())
saveRDS(object=geolocation, file=paste(paste(wd,"/data_input/",sep = ""),"geolocation.rds",sep=""))
Re-combine Data Customer Order:
#Customer Order
customer_order <- customers %>%
left_join(geolocation,by=c("customer_zip_code_prefix"="geolocation_zip_code_prefix")) %>%
select(-c(geolocation_city,customer_state)) %>%
rename(customer_lat=geolocation_lat, customer_lng=geolocation_lng,
customer_state_code=geolocation_state, customer_state_name=state_name,customer_region=region) %>%
left_join(orders, by=c("customer_id")) %>%
left_join(order_items, by=c("order_id")) %>%
left_join(products, by=c("product_id")) %>%
select(-c(product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,
product_length_cm,product_height_cm,product_width_cm)) %>%
left_join(sellers, by=c("seller_id")) %>%
left_join(geolocation, by=c("seller_zip_code_prefix"="geolocation_zip_code_prefix")) %>%
select(-c(geolocation_city,seller_state)) %>%
rename(seller_lat=geolocation_lat, seller_lng=geolocation_lng,
seller_state_code=geolocation_state, seller_state_name=state_name,seller_region=region)
order_review <- orders %>%
left_join(order_reviews,by=c("order_id")) %>%
group_by(order_id) %>%
summarise(
review_score = round(mean(review_score))
) %>%
ungroup()
customer_order <- customer_order %>% left_join(order_review, by=c("order_id")) %>%
mutate(
price = ifelse(is.na(price),0,price),
freight_value = ifelse(is.na(freight_value),0,freight_value)
)
customer_order <- customer_order %>%
select(
order_id, order_purchase_timestamp,order_status,
order_item_id,product_id, product_code, product_category_name, price, freight_value, product_rating=review_score,
customer_unique_id,customer_lat,customer_lng,customer_city, customer_state_name, customer_region) %>%
arrange(order_purchase_timestamp,order_id) %>%
filter(!(!order_status %in% c("canceled","unavailable") & is.na(product_id))) %>%
distinct()
# customer_order <- customer_order %>%
# select(
# order_id, order_purchase_timestamp,order_status,
# order_item_id,product_id, product_code, product_category_name, price, freight_value, product_rating=review_score,
# customer_unique_id,customer_lat,customer_lng,customer_city, customer_state_name, customer_region) %>%
# arrange(order_purchase_timestamp,order_id)
# Value adjustment based on external resources
customer_order[customer_order$customer_unique_id=="507dc9becd4fc65635d90682dfa9d3a3",]$customer_lat <- -27.85576
customer_order[customer_order$customer_unique_id=="507dc9becd4fc65635d90682dfa9d3a3",]$customer_lng <- -54.44738
lat <- customer_order[customer_order$customer_unique_id=="507dc9becd4fc65635d90682dfa9d3a3",]$customer_lat
lng <- customer_order[customer_order$customer_unique_id=="507dc9becd4fc65635d90682dfa9d3a3",]$customer_lng
customer_order[customer_order$customer_unique_id=="e32298294895b7f92d89754e744e930c",]$customer_lat <- lat[1]-0.0001
customer_order[customer_order$customer_unique_id=="e32298294895b7f92d89754e744e930c",]$customer_lng <- lng[1]-0.0002
customer_order[customer_order$customer_unique_id=="ed8bf8f73a4c6a4d1e8b519c03c1c17b",]$customer_lat <- lat[1]-0.0002
customer_order[customer_order$customer_unique_id=="ed8bf8f73a4c6a4d1e8b519c03c1c17b",]$customer_lng <- lng[1]-0.0004
customer_order[customer_order$customer_unique_id=="8ad15fb1cf056731335e36eb723c0eac",]$customer_lat <- lat[1]-0.0004
customer_order[customer_order$customer_unique_id=="8ad15fb1cf056731335e36eb723c0eac",]$customer_lng <- lng[1]-0.0008
customer_order[customer_order$customer_unique_id=="d32e5ab26ec56747954dab0844e8be2e",]$customer_lat <- lat[1]-0.0001
customer_order[customer_order$customer_unique_id=="d32e5ab26ec56747954dab0844e8be2e",]$customer_lng <- lng[1]-0.0008
customer_order[customer_order$customer_unique_id=="f46ebdbd5c48c0683980a759b8937912",]$customer_lat <- lat[1]-0.0008
customer_order[customer_order$customer_unique_id=="f46ebdbd5c48c0683980a759b8937912",]$customer_lng <- lng[1]-0.0002
customer_order[customer_order$customer_unique_id=="dc02867c21f42ed442f02e434234e3bb",]$customer_lat <- lat[1]-0.0002
customer_order[customer_order$customer_unique_id=="dc02867c21f42ed442f02e434234e3bb",]$customer_lng <- lng[1]-0.0008
customer_order[customer_order$customer_unique_id=="23a886ebce4bc86212c72f669a9bdb31",]$customer_lat <- lat[1]-0.0008
customer_order[customer_order$customer_unique_id=="23a886ebce4bc86212c72f669a9bdb31",]$customer_lng <- lng[1]-0.0008
customer_order[customer_order$customer_unique_id=="fa58e11c149d7e903dbe5f6c854277e7",]$customer_lat <- lat[1]-0.0003
customer_order[customer_order$customer_unique_id=="fa58e11c149d7e903dbe5f6c854277e7",]$customer_lng <- lng[1]-0.0006
customer_order[customer_order$customer_unique_id=="39625643b498b08ebf941f9170277974",]$customer_lat <- lat[1]-0.0003
customer_order[customer_order$customer_unique_id=="39625643b498b08ebf941f9170277974",]$customer_lng <- lng[1]-0.0007
customer_order[customer_order$customer_unique_id=="ea0291d9b2223d0dc3f8d5b48ae2574f",]$customer_state_name <- "Rio Grande do Sul"
customer_order[customer_order$customer_unique_id=="decf5e95920bb2dc3fe659c102dbbf70",]$customer_lat <- -29.72267
customer_order[customer_order$customer_unique_id=="decf5e95920bb2dc3fe659c102dbbf70",]$customer_lng <- -53.82436
customer_order[customer_order$customer_unique_id=="6740f8899f3c70b5b08b2e0bad37e567",]$customer_lat <- -25.530555
customer_order[customer_order$customer_unique_id=="6740f8899f3c70b5b08b2e0bad37e567",]$customer_lng <- -48.503208
customer_order[customer_order$customer_unique_id=="9e68775971541ca8a79b6fce848e7aee",]$customer_lat <- -1.466900
customer_order[customer_order$customer_unique_id=="9e68775971541ca8a79b6fce848e7aee",]$customer_lng <- -56.379157
customer_order[customer_order$customer_unique_id=="2a4b1192846ec238d62df3838257bad9",]$customer_lat <- -1.467120
customer_order[customer_order$customer_unique_id=="2a4b1192846ec238d62df3838257bad9",]$customer_lng <- -56.378953
customer_order[customer_order$customer_unique_id=="ba1f863dca8e24fc845fda7e6b9e98d0",]$customer_lat <- -25.866092
customer_order[customer_order$customer_unique_id=="ba1f863dca8e24fc845fda7e6b9e98d0",]$customer_lng <- -49.365795
customer_order[customer_order$customer_unique_id=="67045ba793233569bed80ccf42e02070",]$customer_lat <- -25.846092
customer_order[customer_order$customer_unique_id=="67045ba793233569bed80ccf42e02070",]$customer_lng <- -49.385795
customer_order[customer_order$customer_unique_id=="66596e08440320718dd23ea4d89ed37f",]$customer_lat <- -25.826092
customer_order[customer_order$customer_unique_id=="66596e08440320718dd23ea4d89ed37f",]$customer_lng <- -49.315795
customer_order[customer_order$customer_unique_id=="fb98dba63ddb41a7f253e8f590e80107",]$customer_lat <- -22.27874
customer_order[customer_order$customer_unique_id=="fb98dba63ddb41a7f253e8f590e80107",]$customer_lng <- -42.53641
Missing value after re-combine
#> order_id order_purchase_timestamp order_status
#> 0 0 0
#> order_item_id product_id product_code
#> 767 767 767
#> product_category_name price freight_value
#> 2370 0 0
#> product_rating customer_unique_id customer_lat
#> 0 0 0
#> customer_lng customer_city customer_state_name
#> 0 0 0
#> customer_region
#> 0
Dapat dilihat masih terdapat missing value pada data order_item_id, product_id,price, geolocation seller, hal ini terjadi pada order yang memiliki status canceled dan unavailable, sehingga tidak masalah karena menandakan bahwa customer tidak jadi melakukan order. Kemudian terdapat missing value juga pada data order_approved_at , order_delivered_carrier_date dan order_delivered_customer_date, namun tidak menjadi masalah karena sementara ini variabel tersebut tidak menjadi fokus di projek ini. Berikut 10 data Customer Order teratas:
wd <- as.character(getwd())
saveRDS(object=customer_order, file=paste(paste(wd,"/data_clean/",sep = ""),"customer_order.rds",sep=""))
#saveRDS(object=customer_order, file=paste(paste(wd,"/data_clean/",sep = ""),"customer_order.rds",sep=""))
head(customer_order,10)
Berikut ini data pembayaran per-setiap order yang merupakan hasil merge/join dari data customer, order dan order_payment. Perlu diketahui bahwa Olist mendukung fitur sistem pembayaran kredit dan juga multi-payment. Berikut struktur data dan pengecekan missing value:
# customer_order
customer_payment <- customer_order %>%
select(customer_unique_id,order_id) %>%
distinct() %>%
left_join(order_payments, by=c("order_id")) %>%
select(customer_unique_id,order_id,payment_type,payment_value,payment_sequential) %>%
spread(payment_type,payment_value) %>%
replace(is.na(.), 0) %>%
group_by(order_id) %>%
summarise(
payment_sequential = length(payment_sequential),
boleto = sum(boleto),
credit_card = sum(credit_card),
debit_card = sum(debit_card),
not_defined = sum(not_defined),
voucher = sum(voucher)
) %>%
ungroup() %>%
mutate(
total_payment = boleto+credit_card+debit_card+not_defined+voucher
)
Data Structure:
#> Observations: 99,433
#> Variables: 8
#> $ order_id <chr> "00010242fe8c5a6d1ba2dd792cb16214", "00018f77f2f...
#> $ payment_sequential <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
#> $ boleto <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 34.59, 0.00, 0.00,...
#> $ credit_card <dbl> 72.19, 259.83, 216.87, 25.78, 218.04, 0.00, 31.7...
#> $ debit_card <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
#> $ not_defined <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
#> $ voucher <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
#> $ total_payment <dbl> 72.19, 259.83, 216.87, 25.78, 218.04, 34.59, 31....
Missing Value:
#> order_id payment_sequential boleto credit_card
#> 0 0 0 0
#> debit_card not_defined voucher total_payment
#> 0 0 0 0
Informasi diatas menunjukan tidak terdapat missing value dari data Order Payment. Berikut 10 data Order Payment teratas:
#eval=TRUE
wd <- as.character(getwd())
saveRDS(object=customer_payment, file=paste(paste(wd,"/data_clean/",sep = ""),"customer_payment.rds",sep=""))
head(customer_payment,10)
customer_order <- readRDS("data_clean/customer_order.rds")
customer_payment <- readRDS("data_clean/customer_payment.rds")
plot_order_status <- customer_order %>%
mutate(
year = as.factor(year(order_purchase_timestamp)),
ym = as.yearmon(order_purchase_timestamp),
yearmonth = format(order_purchase_timestamp, format='%Y-%m-1'),
yearmonth = ymd(yearmonth)
) %>%
select(order_id,order_status,year,ym, yearmonth) %>%
distinct() %>%
group_by(year,ym, yearmonth,order_status) %>%
summarise(
freq = n()
) %>%
ungroup() %>%
arrange(yearmonth) %>%
mutate(
popup = glue("Yearmon : {ym}
Frequency : {freq}
Status: {order_status}"),
order_status = reorder(order_status, freq)
) %>%
ggplot(aes(month(yearmonth,label = TRUE), log(freq), color=order_status, fill=order_status, group=order_status))+
geom_bar(aes(text=popup),stat="identity",
position = position_dodge2(padding = 0.4, width = 0.9), show.legend = FALSE) +
#scale_y_continuous(expand = c(0, 0), limits = c(0, 10)) +
labs(
title = "Total Order by status",
x="",
y="Frequency",
fill="Order Status"
)+
facet_wrap(~year, ncol=1, scales = "free_y")+
my_theme_color()+
my_theme_fill()+
my_plot_theme(9)+
theme(
axis.text.y = element_blank(),
legend.position = "bottom",
legend.margin = margin(-0.5,0,0,0, unit="cm")
)
ggplotly(plot_order_status, tooltip = "text", height = 480) %>%
layout(margin = list(l = 50, r = 30, b = 50, t = 90)) %>%
layout(
legend=list(orientation = "h",
y = -0.1, x = 0.0)
)%>%
layout(title = list(text = paste0('Total Order by Status',
'<br>',
'<sup>',
'This visualization using log(frequency)',
'</sup>')))
Data divisualisasikan menggunakan Log(Order Frequency) supaya frekuensi minimun tetap dapat terlihat jelas. Perlu diketahui, data Order ini memiliki beberapa status antara lain:
1. Processing : Status setelah customer melakukan order.
2. Unavailable : Satatus ketika penjual menyatakan stock produk tidak tersedia setelah customer melakukan order.
3. Canceled : Status ketika customer membatalkan order.
4. Approved : Status ketika order disetujui oleh penjual.
5. Invoiced : Status ketika order sudah dibayar oleh customer.
6. Shipped : Status ketika produk sedang dikirimkan oleh penjual ke customer.
7. Delivered : status ketika produk sudah sampai pada customer.
Chart diatas menunjukan jumlah order dengan status Delivered jauh lebih banyak dari yang lainnya pada setiap bulan dari 2016-2018 kecuali pada bulan September-Oktober 2018. Jika dilihat pada bulan September-Oktober 2018 hanya terdapat 1 order berstatus Shipped dan 19 order berstatus Cancelled dan tidak ada order yang berstatus Delivered, sehingga membutikan belum penjualan yang berhasil, kecuali 1 order Shipped tersebut. Mari kita lihat data ordernya.
plot_order_freq <- customer_order %>%
#filter(!order_status %in% c("canceled","unavailable")) %>%
mutate(
year = as.factor(year(order_purchase_timestamp)),
ym = as.yearmon(order_purchase_timestamp),
yearmonth = format(order_purchase_timestamp, format='%Y-%m-1'),
yearmonth = ymd(yearmonth)
) %>%
select(order_id,year,ym, yearmonth) %>%
distinct() %>%
group_by(year,ym, yearmonth) %>%
summarise(freq = n()) %>%
ungroup() %>%
mutate(
normalisasi = (freq-min(freq))/(max(freq)-min(freq)),
popup=glue("Year-Month : {ym}
Frequency : {freq}")) %>%
arrange(yearmonth) %>%
ggplot(aes(x=yearmonth, y=normalisasi),text=popup)+
geom_area(fill=my_theme_hex("col2"),alpha=0.7)+
geom_line(size=0.7,color="white") +
geom_point(color="white", size = 2, alpha = 0.9,aes(text=popup))+
labs(
title="Growth of Order Frequency per Month",
x="Month",
y="Frequency"
)+
scale_x_date(breaks=date_breaks('1 months'),
labels=date_format('%b %y'))+
my_theme_color()+
my_plot_theme(10)+
theme(
axis.text.x = element_text(angle = 90),
legend.position = "none"
)
ggplotly(plot_order_freq, tooltip = "text") %>%
layout(title = list(text = paste0('Growth of Order Frequency per Month',
'<br>',
'<sup>',
'Monthly order frequency from September 2016 - October 2018',
'</sup>')))
Chart diatas menunjukan frekuensi order pada setiap bulannya dari September 2016 - Oktober 2018. Jika dilihat dari frekuensi order tahun 2016 sangat kecil, namun relatif stabil dan meningkat dari januari 2017 - Agustus 2018. Kemudian terjadi penurunan frekuensi order sangat tinggi pada bulan September-Oktober 2018 dari 7308 order menurun hinga 20 order. Berdasarkan visualisasi status order sebelumnya, 20 order pada bulan September-Oktober 2018 terdiri dari 19 order Cancelled dan 1 order shipped. Produk yang dijual pada Olist terdiri dari berbagai macam jenis dan berbagai macam harga, mari kita lihat order amount nya.
plot_order_freq <- customer_order %>%
#filter(!order_status %in% c("canceled","unavailable")) %>%
mutate(
year = as.factor(year(order_purchase_timestamp)),
ym = as.yearmon(order_purchase_timestamp),
yearmonth = format(order_purchase_timestamp, format='%Y-%m-1'),
yearmonth = ymd(yearmonth)
) %>%
group_by(year,ym, yearmonth) %>%
summarise(total_amount = sum(price+freight_value)) %>%
ungroup() %>%
mutate(popup=glue("Year-Month : {ym}
Total Amount : {total_amount}")) %>%
arrange(yearmonth) %>%
ggplot(aes(x=yearmonth, y=total_amount), text=popup)+
geom_area(fill=my_theme_hex("col3"),alpha=0.7)+
geom_line(size=0.7,color="white") +
geom_point(color="white", size = 2, alpha = 0.9, aes(text=popup))+
labs(
title="Growth of Order Amount per Month",
x="Month",
y="Order Amount"
)+
scale_x_date(breaks=date_breaks('1 months'),
labels=date_format('%b %y'))+
my_theme_color()+
my_plot_theme(10)+
theme(
axis.text.x = element_text(angle = 90),
legend.position = "none"
)
ggplotly(plot_order_freq, tooltip = "text") %>%
layout(title = list(text = paste0('Growth of Order Amount per Month',
'<br>',
'<sup>',
'Monthly order amount from September 2016 - October 2018',
'</sup>')))
Terkait penurunan order pada bulan september-Oktober 2018, bisa dilihat order amount bulan september 2018 hanya R$ 166.46 yang merupakan amount dari 1 order yang berstatus Shipped dan karena pada OKtober 2018 seluruh order bersatus Cancelled maka total amount-nya nol(0). Namun secara keseluruhan, dapat dilihat pola visualisasi order amount dan frekuensi order cukup sama. Hal ini menunjukan, frekuensi Order sangat berpengaruh untuk menentukan jumlah order amount, dalam artian hampir dipastikan setiap order memiliki amount disekitar rata-rata, kecuali bulan Agustus 2018, Pada chart frekuensi order menunjukan bulan Agustus 2018 terdapat kenaikan frekuensi order namun pada chart total order amount justru mengalami penurunan. Mengapa?
customer_order %>%
filter(year(order_purchase_timestamp)=="2018", month(order_purchase_timestamp) %in% c("6","7","8")) %>%
group_by(order_id,month=month(order_purchase_timestamp)) %>%
summarise(
order_amount = sum(price+freight_value)
) %>%
ungroup() %>%
mutate(
year = 2018
) %>%
group_by(year,month) %>%
summarise(
freq_order = n(),
mean_order_amount=mean(order_amount)
)
Data diatas menunjukan pada bulan Agustus 2018, rata-rata order amount lebih rendah sehingga mengakibatkan total order amount menjadi lebih rendah. Meskipun begitu, secara keseluruhan frekuensi order sangat berpengaruh terhadap total order amount sehingga kita bisa mengabaikan efek rata-rata order amount perbulan. Berdasarkan case ini dapat disimpulkan bahwa jumlah order amount dapat meningkat jika frekuensi order bertambah, sehingga muncul pertanyaan Bagaimana cara meningkatkan frekuensi order? Tentunya dengan mencari customer baru untuk melakukan order, namun bisa juga apabila jika fokus meningkatkan frekuensi order daripada customer yang sudah ada, sehingga pertanyaannya berubah menjadi Bagaimana meningkatkan frekuensi order daripada customer yang ada?. Oke sebelum sampai kesitu mari kita cek seberapa besar seorang customer melakukan pembelian berulang.
cust_freq_order <- customer_order %>%
select(customer_unique_id,order_id) %>%
distinct() %>%
group_by(customer_unique_id) %>%
summarise(freq = n()) %>%
ungroup() %>%
group_by(freq) %>%
summarise(
count_cust = n()
) %>%
ungroup() %>%
mutate(
normalisasi = (count_cust-min(count_cust))/(max(count_cust)-min(count_cust))+0.1,
popup = glue("Repeat Order: {freq}
Count Cust: {count_cust}")
)
plot_cust_freq_order <- ggplot(cust_freq_order,aes(freq,normalisasi))+
geom_bar(aes(fill=freq,text=popup), stat = "identity", position = position_dodge2(padding = 2))+
geom_text(aes(label=count_cust, y=normalisasi+0.05),size=3, color="white")+
labs(
title="Total Customer based on Repeat Order",
x = "Frequency of Repeat Order",
y=""
)+
scale_x_continuous(limits = c(0,(max(cust_freq_order$freq)+2)),breaks = c(0:(max(cust_freq_order$freq)+2)))+
scale_fill_gradient(low=my_theme_hex("col1"),high=my_theme_hex("col5"))+
my_plot_theme(10)+
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank()
)
ggplotly(plot_cust_freq_order, tooltip = "text") %>%
layout(title = list(text = paste0('Total Customer by Repeat Order',
'<br>',
'<sup>',
'Data using Order frequency of each Customer from 2016-09-04 untill 2018-10-17 ',
'</sup>')))
Repeat order paling banyak terjadi 17 kali dan hanya dilakukan oleh 1 orang customer. Selain itu, 97% Customer hanya melakukan 1 kali order. Dalam case ini saya berasumsi bahwa Olist kurang berupaya untuk meningkatkan frekuensi order dari customer yang ada, cukup disayangkan mengingat jumlah customer yang ada tidak sedikit. Terkait chart diatas, maka seharusnya frekuensi order berbanding lurus dengan pertumbuhan customer baru yang melakukan order. Mari kita cek.
plot_new_cust <- customer_order %>%
#filter(!order_status %in% c("canceled","unavailable")) %>%
mutate(
year = as.factor(year(order_purchase_timestamp)),
ym = as.yearmon(order_purchase_timestamp),
yearmonth = format(order_purchase_timestamp, format='%Y-%m-1'),
yearmonth = ymd(yearmonth)
) %>%
group_by(customer_unique_id,year,ym, yearmonth) %>%
summarise(
first_order = min(order_purchase_timestamp)
) %>%
ungroup() %>%
group_by(year,ym, yearmonth) %>%
summarise(freq = n()) %>%
ungroup() %>%
mutate(
popup=glue("Year-mon: {ym}
{freq} new customers")
) %>%
arrange(yearmonth) %>%
ggplot(aes(yearmonth,freq))+
geom_area(fill=my_theme_hex("col5"),alpha=0.7)+
geom_line(size=0.7,color="white") +
geom_point(color="white", size = 2, alpha = 0.9, aes(text=popup))+
labs(
title="Growth of New Customer by Order per Month",
x="Year-Month",
y="Freq New Customer"
)+
scale_x_date(breaks=date_breaks('1 months'),
labels=date_format('%b %y'))+
my_theme_color()+
my_plot_theme(10)+
theme(
axis.text.x = element_text(angle = 90),
legend.position = "none"
)
ggplotly(plot_new_cust, tooltip = "text") %>%
layout(title = list(text = paste0('Growth of New Customer by Order per Month',
'<br>',
'<sup>',
'Exclude order with status in Cancelled & Unavailable',
'</sup>')))
Bisa dilihat pola visualisasi chart diatas relatif sama dengan pola visualisasi frekuensi order dan pola visualisasi total order amount, maka seharusnya jumlah customer akan sangat mempengaruhi total order amount yang ada, berikut pengecekannya:
plot_freq_amount <- customer_order %>%
#filter(!order_status %in% c("canceled","unavailable")) %>%
group_by(customer_unique_id,customer_state_name,year=as.factor(year(order_purchase_timestamp))) %>%
summarise(
total = sum(price+freight_value)
) %>%
ungroup() %>%
distinct() %>%
group_by(customer_state_name,year = year) %>%
summarise(
total = sum(total),
freq = n()
) %>%
ungroup() %>%
mutate(
popup = glue("State : {customer_state_name}
Total Customer : {freq}
Total Order Amount : {total}")
) %>%
ggplot(aes(freq,total), text=popup, group = customer_state_name)+
geom_smooth(method = "lm", color=my_theme_hex("col2"), alpha=0.5)+
geom_point(aes(color=customer_state_name, text=popup),show.legend = FALSE, size=3, alpha=0.7)+
labs(
title = "Impact of Total Customer on Order Amount",
y="",
x="Total Customer"
)+
facet_wrap(~year, scales = "free", ncol = 1)+
my_theme_color()+
my_plot_theme(11)+
theme(
panel.spacing = unit(1.2,"lines")
)
ggplotly(plot_freq_amount, tooltip="text", height = 580) %>%
layout(showlegend=FALSE,
margin = list(l = 50, r = 30, b = 50, t = 90)) %>%
layout(title = list(text = paste0('Customer VS Order Amount',
'<br>',
'<sup>',
'Impact of Total Customer on Order Amount from 2016-09-04 untill 2018-10-17',
'</sup>')))
Chart diatas memperjelas bahwa semakin meningkat jumlah customer maka semakin meningkat total order amount. Sangat baik Olist selalu mampu mendapatkan customer baru, meskipun begitu dapat dilihat karena pada September-Oktober 2018 jumlah customer baru hanya 18 orang sehingga menyebabkan total order amount menurun drastis. Saya berpendapat bahwa akan sangat baik apabila Olist mempertimbangkan customer yang sudah ada untuk meningkatkan frekuensi order dan total order amount. Mari kita lihat total order amount dan total customer pada setiap negara bagian di Brazil.
plot_best_state <- customer_order %>%
filter(!order_status %in% c("canceled","unavailable")) %>%
group_by(customer_state_name) %>%
summarise(
total = sum(price+freight_value)
) %>%
ungroup() %>%
mutate(
customer_state_name = reorder(customer_state_name,total),
popup = glue("State: {customer_state_name}
Total Monetary : {total}")
) %>%
ggplot(aes(customer_state_name,(total/sum(total))*100, text=popup)) +
geom_bar(aes(fill=customer_state_name), stat = "identity", show.legend = FALSE)+
geom_text(aes(label=paste0(round((total/sum(total))*100,1),"%"),
y=(total/sum(total))*100+1.5), vjust = -0.5, size=2.5, color="white") +
labs(
title = "Total Order Amount by State in Brazil",
x="",
y="Total Order Amount (%)",
fill="Year"
)+
coord_flip()+
my_theme_fill()+
my_plot_theme(10)
ggplotly(plot_best_state, tooltip="text") %>%
layout(showlegend=FALSE) %>%
layout(title = list(text = paste0('Total Order Amount by State',
'<br>',
'<sup>',
'Total Order Amount by State from 2016-09-04 untill 2018-10-17',
'</sup>')))
Sao Paulo memiliki total order amount paling tinggi dengan proporsi 37.4% dari total order amount keseluruhan dan jauh lebih tinggi dari negara bagian lainnya. Mari kita cek jumlah customernya.
plot_total_customer <- customer_order %>%
filter(!order_status %in% c("canceled","unavailable")) %>%
select(customer_unique_id,customer_state_name) %>%
distinct() %>%
group_by(customer_state_name) %>%
summarise(
freq = n()
) %>%
ungroup() %>%
mutate(
customer_state_name = reorder(customer_state_name,freq),
popup = glue("State: {customer_state_name}
Total Customer : {freq}")
) %>%
ggplot(aes(customer_state_name,(freq/sum(freq))*100, text=popup)) +
geom_bar(aes(fill=customer_state_name), stat = "identity", show.legend = FALSE)+
geom_text(aes(label=paste0(round((freq/sum(freq))*100,2),"%"),
y=(freq/sum(freq))*100+2), hjust = 0.2, size=2.5, color="white") +
labs(
title = "Total Customer by State",
x="",
y="Total Customer (%)",
fill="Year"
)+
coord_flip()+
my_theme_fill()+
my_plot_theme(10,"Segoe UI Semibold")
ggplotly(plot_total_customer, tooltip="text") %>%
layout(showlegend=FALSE) %>%
layout(title = list(text = paste0('Total Customer by State',
'<br>',
'<sup>',
'Total Customer by State from 2016-09-04 untill 2018-10-17',
'</sup>')))
Total customer paling banyak di Sao Paulo dengan proporsi 41.8% dari jumlah keseluruhan customer dan juga jauh lebih tinggi dari negara bagian lainnya. Hal ini masuk akal, mengingat jumlah customer sangat mempengaruhi total order amount. Dalam case ini, dapat dipastikan Sao Paulo adalah negara bagian yang paling memiliki potensi untuk meningkatkan conversion rate dan customer retention.
–visualisasi map terlalu berat.
#EVAL FALSE
#Brazil<-map_data("world")%>%filter(region=="Brazil")
geo_customer_order <- customer_order %>%
filter(!order_status %in% c("canceled","unavailable")) %>%
select(customer_unique_id,customer_state_name,customer_lat,customer_lng,customer_state_name) %>%
distinct() %>%
mutate(
popup = glue("Cust ID: {customer_unique_id}
State: {customer_state_name}")
)
# plot_geo_customer_order <- ggplot() +
# geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="#181818")+
# geom_point(data= geo_customer_order,aes(x=customer_lng,y=customer_lat, color=customer_state_name, text=popup),
# size=0.3, alpha=0.5)+
# labs(
# title = "Customer by State",
# x="Longitude",
# y="Latitude",
# color="State Name"
# )+
# my_theme_fill()+
# my_plot_theme(10)
#
#
# ggplotly(plot_geo_customer_order, tooltip="text", height=560) %>%
# #layout(showlegend=FALSE) %>%
# layout(title = list(text = paste0('Customer by State',
# '<br>',
# '<sup>',
# 'Customer by State based on order from 2016-09-04 untill 2018-10-17',
# '</sup>')))
labels <- sprintf(
"ID: <strong>%s</strong><br/>State Name: <strong>%s</strong>",
geo_customer_order$customer_unique_id,
geo_customer_order$customer_state_name
) %>% lapply(htmltools::HTML)
geo_customer_order %>% leaflet() %>%
setView(-55.69142,-17.95,4) %>%
addTiles() %>%
#addProviderTiles(providers$CartoDB.DarkMatter) %>%
#addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
#addProviderTiles(provider = providers$HERE.mapLabels) %>%
#addProviderTiles(providers$Stamen.TonerLines) %>%
clearShapes() %>%
addMarkers(
clusterOptions = markerClusterOptions(),
lng = ~ customer_lng,
lat = ~ customer_lat,
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto")
)
# addCircles( lng = ~ customer_lng,
# lat = ~ customer_lat,
# label = labels,
# popup = labels,
# labelOptions = labelOptions(
# style = list("font-weight" = "normal", padding = "3px 8px"),
# textsize = "15px",
# direction = "auto"),
# #weight = 3,
# #radius=40,
# color= ~cof(recency_score), stroke = TRUE, fillOpacity = 0.5) %>%
# addLegend("bottomright", pal = cof, values = ~recency_score,
# title = "Recency Score",
# opacity = 1
# )
plot_best_category <- customer_order %>%
filter(!order_status %in% c("canceled","unavailable")) %>%
group_by(product_category_name,year=year(order_purchase_timestamp)) %>%
summarise(
freq = n()
) %>%
arrange(year,desc(freq)) %>%
group_by(year) %>%
slice(1:10) %>%
ungroup() %>%
mutate(
product_category_name = gsub("_"," ",product_category_name),
popup = glue("Year : {2016}
Product Category : {product_category_name}
Ordered : {freq}"
),
#product_category_name = str_sub(product_category_name,1,10),
product_category_name = reorder(product_category_name, freq)
) %>%
#filter(year==2017) %>%
ggplot(aes(x=product_category_name,y=as.integer(freq), group=year, fill=product_category_name)) +
geom_bar(stat = "identity", show.legend = FALSE, aes(text=popup))+
labs(
title = "Top 10 Product Category per Year",
y="Order Quantity",
x=""
)+
coord_flip()+
facet_grid(~year, scales = "free")+
my_theme_fill()+
my_plot_theme(11)
ggplotly(plot_best_category, tooltip="text") %>%
layout(showlegend=FALSE,
margin = list(l = 50, r = 30, b = 50, t = 90)) %>%
layout(title = list(text = paste0('Top 10 Product Category per Year',
'<br>',
'<sup>',
'Best Product based on Order Quantity from 2016-09-04 untill 2018-10-17',
'</sup>')))
Chart diatas memberikan informasi 10 kategori produk yang paling banyak diorder dari tahun 2016-2018. Jika dilihat kategori produk yang konsisten masuk kategori top 10 product category dari 2016-2018 adalah Health Beauty, Sports Leisure, Funiture Decor, Computers Accessories dan Housewares. Jika dilihat secara keseluruhan, kategori produk Health Beauty menjadi tren produk nomor 1 yang diorder, namun jika secara quantity produk kategori bed bath table yang paling banyak di-order meskipun produk kategori bed bath table tidak masuk top 10 product category di tahun 2016. Berdasarkan case ini jika kita hendak merekomendasikan berdasarkan tren maka kategori produk Health Beauty jelas punya pasar setiap tahunnya, namun jika kita hendak merekomendasikan berdasarkan jumlah order maka produk kategori bed bath table karena jumlah ordernya paling tinggi dan berkelanjutan dari 2017-2018.
plot_best_product <- customer_order %>%
filter(!order_status %in% c("canceled","unavailable")) %>%
group_by(product_id,product_code,product_category_name,year=year(order_purchase_timestamp)) %>%
summarise(
freq = n()
) %>%
arrange(year,desc(freq)) %>%
group_by(year) %>%
slice(1:10) %>%
ungroup() %>%
mutate(
product_code = reorder(product_code, freq),
popup = glue("Year : {year}
Product Code: {product_code}
Product Category : {product_category_name}
Ordered : {freq}"
)
) %>%
ggplot(aes(x=product_code,y=as.integer(freq), group=year, fill=product_code)) +
geom_bar(stat = "identity", show.legend = FALSE, aes(text=popup))+
labs(
title = "Top 10 Product per Year",
y="Order Quantity",
x=""
)+
coord_flip()+
facet_grid(~year, scales = "free")+
my_theme_fill()+
my_plot_theme(11)+
theme(
panel.spacing = unit(1.2,"lines")
)
ggplotly(plot_best_product, tooltip="text", height = 600) %>%
layout(showlegend=FALSE,
margin = list(l = 50, r = 30, b = 50, t = 90)) %>%
layout(title = list(text = paste0('Top 10 Product by Order per Year',
'<br>',
'<sup>',
'Best Product based on Order Quantity from 2016-09-04 untill 2018-10-17',
'</sup>')))
Chart diatas memberikan informasi 10 produk yang paling banyak di-order dan menunjukan tren produk di-order dari 2016-2018. Jika dilihat 10 produk yang paling banyak dibeli ditahun 2016 tidak masuk ke top 10 product lagi ditahun 2017 dan 2018. Jika dilihat produk PR14052, PR30294, PR04599 masuk kategori top 10 product pada tahun 2017 dan 2018 sehingga kemungkinan produk tersebut masuk tren ditahun selanjutnya lebih besar. Namun jika dilihat secara keseluruhan, tren Top 10 Product produk terjual setiap tahunnya relatif berbeda sehingga akan sulit memastikan rekomendasi produk yang spesifik ke setiap customer setiap tahunnya.
plot_best_rating <- customer_order %>%
filter(!order_status %in% c("canceled","unavailable")) %>%
group_by(product_id,product_code,product_category_name,year=year(order_purchase_timestamp)) %>%
summarise(
total_rating = sum(product_rating)
) %>%
arrange(year,desc(total_rating)) %>%
group_by(year) %>%
slice(1:10) %>%
ungroup() %>%
mutate(
product_code = reorder(product_code, total_rating),
popup = glue("Year : {year}
Product Code: {product_code}
Product Category : {product_category_name}
Total Rating : {total_rating}"
)
) %>%
ggplot(aes(x=product_code,y=as.integer(total_rating), group=year, fill=product_code)) +
geom_bar(stat = "identity", show.legend = FALSE, aes(text=popup))+
labs(
title = "Top 10 Product per Year",
y="Total Product Rating",
x=""
)+
coord_flip()+
facet_grid(~year, scales = "free")+
my_theme_fill()+
my_plot_theme(11)+
theme(
panel.spacing = unit(1.2,"lines")
)
ggplotly(plot_best_rating, tooltip="text", height = 600) %>%
layout(showlegend=FALSE,
margin = list(l = 50, r = 30, b = 50, t = 90)) %>%
layout(title = list(text = paste0('Top 10 Product by Rating per Year',
'<br>',
'<sup>',
'Best Product based on Rating from 2016-09-04 untill 2018-10-17',
'</sup>')))
Data Rating diatas diambil berdasarkan total rating yang didapatkan dari setiap produk. Masing-masing produk diberikan rating 1 - sampai 5 setiap kali ketika customer selesai melakukan order. Alasanya menggunakan jumlah rating karena tidak semua produk memiliki total order quantity yang sama. Chart diatas juga menunjukan tren produk yang berbeda setiap tahunnya. Rating sendiri bisa kita gunakan untuk menentukan rekomendasi produk kepada customer dengan mencari tingkat similarity dari masing-masing produk, namun tantangannya 97% customer hanya melakukan 1 kali order yang dimana setiap order hanya terdiri dari 1 produk. Berdasarkan case ini mungkin kita bisa mempertimbangkan variabel lain yang digunakan untuk menentukan rekomendasi produk secara personal.
Olist sangat baik dalam mendatangkan customer baru. Hal ini dibuktikan 97% transaksi dihasilkan oleh customer baru. Namun mengingat hanya 3% Customer yang melakukan repeat order dari September 2016-Oktober 2018, saya berasumsi bahwa Olist kurang memperhatikan customer lama atau customer yang sudah melakukan order sebelumnya. Melihat data order pada September-Oktober 2018, jumlah customer baru hanya 18 orang sehingga mengakibatkan penurunan frekuensi order yang drastis dari bulan sebelumnya. Hal ini menunjukan frekuensi order pada Olist sangat bergantung pada pertumbuhan customer baru, sehingga saya menyarankan supaya Olist juga dapat mempertimbangkan untuk meningkatkan frekuensi order pada customer yang sudah ada. Jika berdasarkan visualisasi diatas, langkah awal yang dapat diambil yaitu memberikan promosi dan rekomendasi produk-produk terlaris dan sedang tren kepada customer yang berada di negara bagian Sao Paulo, Rio de Janeiro, Minas Gerais, Rio Grande do Sul dan Parana karena 77% customer terdapat pada negara bagian tersebut. Supaya strategi promosi dan rekomendasi tersebut lebih optimal, saya juga menyarankan Olist untuk melakukan segmentasi terhadap customer yang sudah ada supaya target pasarnya bisa lebih tepat dan tentunya akan lebih menghemat waktu dan biaya pemasaran.
Berdasarkan hasil analisa diatas, maka projek ini bertujuan untuk membangun model segmentasi customer dan model rekomendasi produk terhadap customer yang berada di negara bagian Sao Paulo, Rio de Janeiro, Minas Gerais, Rio Grande do Sul dan Parana. Model segmentasi customer akan diterapkan berdasarkan RFM Value karena sebagian besar customer hanya melakukan 1 kali order dan model rekomendasi akan diterapkan berdasarkan produk yang di-suka/order oleh customer. Oleh karena itu, data input yang digunakan untuk membuat model tersebut harus memiliki struktur sebagai berikut:
Data input akan dikelola dan memiliki RFM Value. Metode analisis Recency, Frequency, Monetary Value (RFM) adalah salah satu metode analisis perilaku pelanggan berdasarkan histori transaksinya. Output dari analisis RFM dapat digunakan untuk melakukan segmentasi pelanggan. Metode analisis RFM dapat menjelaskan:
Sesuai artikel yang dipublish oleh www.marketeers, GO-JEK adalah salah satu perusahaan yang menggunakan metode analisis RFM dalam menentukan segmentasi pelanggan. GO-JEK membagi segmentasi pelanggan ke dalam empat kelas, yaitu Gold, Silver, Bronze, dan Non-Profit. Segmentasi Gold memiliki kualifikasi konsumen dengan high monetary, high frequency, dan high recency. Silver memiliki kualifikasi konsumen dengan tingkat monetary yang tinggi, frequency yang rendah, dan recency yang tinggi. Kategori Bronze terdiri dari konsumen dengan tingkat monetary rendah, frequency, dan recency yang tinggi. Sementara segmentasi Non-Profit memiliki kualifikasi konsumen dengan kualifikasi monetary, frequency, dan recency yang rendah.
Dalam case Olist ini, dataset berisikan data transaksi dari 2016-09-04 sampai 2018-10-17 sehinnga berikut perlakukan RFM Value untuk case ini:
analysis_date = date(max(customer_order$order_purchase_timestamp))+days(1)
customer_rfm <- customer_order %>%
group_by(customer_unique_id) %>%
mutate(first_order = min(order_purchase_timestamp),
last_order = max(order_purchase_timestamp),
min_product_price = min(price),
max_product_price = max(price),
total_price = sum(price),
total_freight = sum(freight_value),
recency = (analysis_date-date(last_order))
) %>%
slice(1) %>%
ungroup() %>%
select(customer_unique_id,customer_lat,customer_lng,customer_state_name,
first_order,last_order,last_order,min_product_price,
max_product_price,total_price,total_freight,recency) %>%
mutate(
min_product_price = ifelse(is.na(min_product_price),0,min_product_price),
max_product_price = ifelse(is.na(max_product_price),0,max_product_price),
total_price = ifelse(is.na(total_price),0,total_price),
total_freight = ifelse(is.na(total_freight),0,total_freight)
)
customer_rfm_freq <- customer_order %>%
select(customer_unique_id, order_id) %>%
distinct() %>%
group_by(customer_unique_id) %>%
summarise(
frequency = n()
) %>%
ungroup()
customer_rfm <- customer_rfm %>% left_join(
customer_rfm_freq,
by=c("customer_unique_id")) %>%
mutate(
recency = round(as.numeric(recency)),
monetary = total_price + total_freight
)
#summary(customer_rfm)
customer_rfm_result <- rfm_table_customer(customer_rfm, customer_unique_id, frequency, recency, monetary, analysis_date)
customer_rfm <- merge(customer_rfm, customer_rfm_result$rfm, by.x=c("customer_unique_id"), by.y=c("customer_id")) %>%
filter(customer_state_name %in% c("São Paulo","Rio de Janeiro","Minas Gerais","Rio Grande do Sul","Paraná")) %>%
distinct() %>%
select(-c(recency_days,transaction_count,amount))
#summary(customer_rfm)
wd <- as.character(getwd())
saveRDS(object=customer_rfm, file=paste(paste(wd,"/data_clean/",sep = ""),"customer_rfm.rds",sep=""))
Berikut 10 data teratas dari Customer RFM Model yang dibuat: Data-data selain RFM Value digunakan sebagai variabel pendukung untuk melakukan segmentasi dan rekomendasi.
Berikut visualisasi map-nya:
cof <- colorFactor(pal_one,
domain= unique(unique(customer_rfm$customer_state_name)))
labels <- sprintf(
"ID: %s <br/>State Name: %s <br>Recency: %g <br>Frequency: %g <br/>Monetary: R$ %g <br> RFM Score: %g",
customer_rfm$customer_unique_id,
customer_rfm$customer_state_name,
customer_rfm$recency,
customer_rfm$frequency,
customer_rfm$monetary,
customer_rfm$rfm_score
) %>% lapply(htmltools::HTML)
customer_rfm %>% leaflet() %>%
setView(-54.69142,-23.95,5) %>%
addTiles() %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>%
#addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
#addProviderTiles(provider = providers$HERE.mapLabels) %>%
#addProviderTiles(providers$Stamen.TonerLines) %>%
clearShapes() %>%
addCircles( lng = ~ customer_lng,
lat = ~ customer_lat,
label = labels,
popup = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"),
#weight = 3,
#radius=40,
color= ~cof(customer_state_name), stroke = TRUE, fillOpacity = 0.5) %>%
addLegend("bottomright", pal = cof, values = ~customer_state_name,
title = "State Name",
#labFormat = labelFormat(prefix = "$"),
opacity = 1
)