# load library
library(readxl)
library(ggplot2)
library(dplyr)
library(tidyr)
library(prettydoc)

Introduction

Every company or business entity that offers products or services to customers in the big data era is now turning to utilizing its customer data sources. The marketing concept is a marketing plan based on meeting the needs and desires of customers to achieve organizational goals. This makes customer needs something that needs to be fulfilled. Companies can collect more complete information about customers and potential customers from various sources, one of which is transactional data.

Read Data

Transactional data is data that contains transaction history. The transactional data components that exist in Kalbe Nutritionals is as follows.

sales <- readxl::read_xlsx("Data Morinaga-Zee Cleansing.xlsx", sheet = 3)
head(sales)

The description of each variable is outlined below;

KontakID
JoinDate
GROUP SM
CompletedDate
PurchaseDate
IDTransaksi
Source
Amount
AmountwithPPn
Qty
Channel
ChannelGroup
CityToko
ProvinceToko
LOB
Brand
SubBrand
Product
SKUProduct
Gender
BoD
Age2
Occupation
District
City
Province

To dig deeper insights, then we will look at the most spending made on the transaction data that we have

as.data.frame(sort(table(sales$Source), decreasing = TRUE))

Hanya mengambil beberapa source yang potensial saja

sales_clean <- sales[sales$Source == c("WHATSAPP", "SHOPEE", "TOKOPEDIA", "SHOPEE Susu_A", "Tokopedia Susu_A", "LAZADA", "JD.ID", "LAZADA Susu_A", "BUKALAPAK", "KALCARE"),]
str(sales_clean)

#> tibble [6,513 × 26] (S3: tbl_df/tbl/data.frame)
#>  $ KontakID     : chr [1:6513] "0507-0047164" "0506-0022783" "0506-0029200" "0506-0022783" ...
#>  $ JoinDate     : POSIXct[1:6513], format: "2011-10-05" "2014-04-05" ...
#>  $ GROUP SM     : chr [1:6513] "KHD" "KHD" "KHD" "KHD" ...
#>  $ CompletedDate: POSIXct[1:6513], format: "2020-05-29" "2021-02-03" ...
#>  $ PurchasedDate: POSIXct[1:6513], format: "2020-05-27" "2021-02-02" ...
#>  $ IDTransaksi  : chr [1:6513] "YE-013170" "ZB-000573" "ZA-006302" "ZJ-005871" ...
#>  $ Source       : chr [1:6513] "WHATSAPP" "WHATSAPP" "WHATSAPP" "WHATSAPP" ...
#>  $ Amount       : chr [1:6513] "231363.63" "157727.27" "360000.00" "456818.19" ...
#>  $ AmountwithPPn: chr [1:6513] "254499.99" "173500.00" "396000.00" "502500.01" ...
#>  $ Qty          : chr [1:6513] "1.00" "1.00" "3.00" "3.00" ...
#>  $ Channel      : chr [1:6513] "KHD - WHATSAPP" "KHD - WHATSAPP" "KHD - WHATSAPP" "KHD - WHATSAPP" ...
#>  $ ChannelGroup : chr [1:6513] "KHD" "KHD" "KHD" "KHD" ...
#>  $ CityToko     : chr [1:6513] "JAKARTA TIMUR" "JAKARTA TIMUR" "JAKARTA TIMUR" "JAKARTA TIMUR" ...
#>  $ ProvinceToko : chr [1:6513] "DKI JAKARTA" "DKI JAKARTA" "DKI JAKARTA" "DKI JAKARTA" ...
#>  $ LOB          : chr [1:6513] "KN2" "KN2" "KN2" "KN2" ...
#>  $ Brand        : chr [1:6513] "Susu_A" "Susu_A" "Susu_A" "Susu_A" ...
#>  $ Subbrand     : chr [1:6513] "CHS" "CHM" "CHS" "CHK" ...
#>  $ Product      : chr [1:6513] "Susu_A SCHOOL REGULER" "Susu_A MIL SOYA" "Susu_A SCHOOL REGULER" "Susu_A KID SOYA" ...
#>  $ SKUProduct   : chr [1:6513] "Susu_A SCHOOL REGULER VANILLA 4X400 G R16" "Susu_A MIL SOYA 600 G" "Susu_A SCHOOL REGULER VANILLA 2 X 400 G" "Susu_A KID SOYA 2X300 G" ...
#>  $ Gender       : chr [1:6513] "Perempuan" "Perempuan" "Perempuan" "Perempuan" ...
#>  $ BoD          : chr [1:6513] "2005-07-08 16:43:06.973000000" "2005-06-08 13:56:28.350000000" "2005-06-15 15:18:06.287000000" "2005-06-08 13:56:28.350000000" ...
#>  $ Age2         : num [1:6513] 16.5 16.6 16.5 16.6 16.5 ...
#>  $ Occupation   : chr [1:6513] NA "IBU RUMAH TANGGA" "PEGAWAI SWASTA" "IBU RUMAH TANGGA" ...
#>  $ District     : chr [1:6513] "PESANGGRAHAN" "SAWAH BESAR" "MAKASAR" "SAWAH BESAR" ...
#>  $ City         : chr [1:6513] "JAKARTA SELATAN" "JAKARTA PUSAT" "JAKARTA TIMUR" "JAKARTA PUSAT" ...
#>  $ Province     : chr [1:6513] "DKI JAKARTA" "DKI JAKARTA" "DKI JAKARTA" "DKI JAKARTA" ...

We will only focus on the variables that will be used and remove redundant variables

sales_clean <- subset(sales_clean, select = -c(Amount, BoD, CityToko, ProvinceToko, Channel, ChannelGroup))

Cleansing Data

Explicit Coercion

# mengubah tipe data factor
sales_clean[,c("GROUP SM", "LOB", "Brand", "Subbrand", "Gender", "Occupation", "City", "Province")] <- lapply(sales_clean[,c("GROUP SM", "LOB", "Brand", "Subbrand", "Gender", "Occupation", "City", "Province")], as.factor)

# mengubah tipe data integer
sales_clean[,c("Qty", "Age2")] <- lapply(sales_clean[,c("Qty", "Age2")], as.integer)

# mengubah tipe data numerik
sales_clean$AmountwithPPn <- as.numeric(sales_clean$AmountwithPPn)

str(sales_clean)

#> tibble [6,513 × 20] (S3: tbl_df/tbl/data.frame)
#>  $ KontakID     : chr [1:6513] "0507-0047164" "0506-0022783" "0506-0029200" "0506-0022783" ...
#>  $ JoinDate     : POSIXct[1:6513], format: "2011-10-05" "2014-04-05" ...
#>  $ GROUP SM     : Factor w/ 16 levels "CHAKRA","DMQI",..: 8 8 8 8 8 8 8 8 10 8 ...
#>  $ CompletedDate: POSIXct[1:6513], format: "2020-05-29" "2021-02-03" ...
#>  $ PurchasedDate: POSIXct[1:6513], format: "2020-05-27" "2021-02-02" ...
#>  $ IDTransaksi  : chr [1:6513] "YE-013170" "ZB-000573" "ZA-006302" "ZJ-005871" ...
#>  $ Source       : chr [1:6513] "WHATSAPP" "WHATSAPP" "WHATSAPP" "WHATSAPP" ...
#>  $ AmountwithPPn: num [1:6513] 254500 173500 396000 502500 43500 ...
#>  $ Qty          : int [1:6513] 1 1 3 3 1 1 6 2 1 3 ...
#>  $ LOB          : Factor w/ 2 levels "KN2","KN4": 1 1 1 1 2 1 1 2 1 1 ...
#>  $ Brand        : Factor w/ 2 levels "Susu_A","Susu_B": 1 1 1 1 2 1 1 2 1 1 ...
#>  $ Subbrand     : Factor w/ 11 levels "BMT","CHGM","CHGP",..: 6 5 6 4 10 5 2 10 6 6 ...
#>  $ Product      : chr [1:6513] "Susu_A SCHOOL REGULER" "Susu_A MIL SOYA" "Susu_A SCHOOL REGULER" "Susu_A KID SOYA" ...
#>  $ SKUProduct   : chr [1:6513] "Susu_A SCHOOL REGULER VANILLA 4X400 G R16" "Susu_A MIL SOYA 600 G" "Susu_A SCHOOL REGULER VANILLA 2 X 400 G" "Susu_A KID SOYA 2X300 G" ...
#>  $ Gender       : Factor w/ 4 levels "Laki-laki","Laki-Laki",..: 3 3 3 3 3 3 3 3 3 3 ...
#>  $ Age2         : int [1:6513] 16 16 16 16 16 16 16 16 16 16 ...
#>  $ Occupation   : Factor w/ 22 levels "BEAUTY AMBASSADOR",..: NA 10 14 10 NA 10 NA 10 14 14 ...
#>  $ District     : chr [1:6513] "PESANGGRAHAN" "SAWAH BESAR" "MAKASAR" "SAWAH BESAR" ...
#>  $ City         : Factor w/ 29 levels "BANDUNG","BANDUNG BARAT",..: 15 14 16 14 16 14 15 14 17 16 ...
#>  $ Province     : Factor w/ 3 levels "BANTEN","DKI JAKARTA",..: 2 2 2 2 2 2 2 2 2 2 ...

Handling Missing Values

Now let’s look at the number of missing values in our data.

colSums(is.na(sales_clean))

#>      KontakID      JoinDate      GROUP SM CompletedDate PurchasedDate 
#>             8             8            26             8             8 
#>   IDTransaksi        Source AmountwithPPn           Qty           LOB 
#>             8             8             8             8             8 
#>         Brand      Subbrand       Product    SKUProduct        Gender 
#>             8             8             8             8            21 
#>          Age2    Occupation      District          City      Province 
#>             8          2923           999            44             8

Performs handling missing values by taking only rows that are not missing.

sales_clean <- sales_clean[complete.cases(sales_clean), ]

Check again whether there are still missing rows in the data

anyNA(sales_clean)

#> [1] FALSE

Duplicated Data

First of all, we’ll check for duplicate data. This can be done with the duplicated() function. To see the number of duplicate rows, we can wrap the duplicated() function with the sum() function.

sum(duplicated(sales_clean))

#> [1] 0

The 0 above indicates that there are no identical rows.

Data Discretization

A data may have one of several types, such as numeric or categorical. Numerical data discretization is one of the most influential data processing steps in data mining. The goal of data discretization is to find a concise representation of the data in the form of categories that are adequate for analysis while retaining as much information as possible. An example of data discretization can be seen in Figure below, where changes are made by discretizing set values from numerical form into categories with certain intervals.

Furthermore, we will discretizing Age2 variables.

convert_age <- function(y){ 
    if(y <= 19)
      {y <- "<20 tahun"}
    else if(y > 20 & y <= 35)
      {y <- "20-35 tahun" }
    else if(y > 35 & y <= 50)
      {y <- "36-50 tahun" }
    else if(y > 50 & y <= 65)
      {y <- "51-65 tahun"}
    else
      {y <- "65 tahun"}  
}

sales_clean$age_cat <- sapply(X = sales_clean$Age2, 
                            FUN = convert_age) 
head(sales_clean$age_cat)

#> [1] "<20 tahun" "<20 tahun" "<20 tahun" "<20 tahun" "<20 tahun" "<20 tahun"

Working with `Lubridate`

Taking a quick peek at the JoinDate, CompletedDate, and PurchasedDate columns reveals that the date values are stored in a year-month-day format, with - (dash) being the delimiter:

head(sales_clean$JoinDate)

#> [1] "2014-04-05 UTC" "2011-10-04 UTC" "2014-04-05 UTC" "2014-04-05 UTC"
#> [5] "2014-04-05 UTC" "2020-05-11 UTC"

head(sales_clean$CompletedDate)

#> [1] "2021-02-03 UTC" "2021-01-14 UTC" "2021-10-22 UTC" "2021-02-25 UTC"
#> [5] "2021-12-11 UTC" "2021-04-20 UTC"

head(sales_clean$PurchasedDate)

#> [1] "2021-02-02 UTC" "2021-01-13 UTC" "2021-10-13 UTC" "2021-02-24 UTC"
#> [5] "2021-12-09 UTC" "2021-04-19 UTC"

Time data manipulation using the lubridate library

A brief use of lubridate can be seen in the cheatsheet.

library(lubridate)

JoinDate Column

sales_clean$JoinDate <- ymd(sales_clean$JoinDate)

head(sales_clean$JoinDate)

#> [1] "2014-04-05" "2011-10-04" "2014-04-05" "2014-04-05" "2014-04-05"
#> [6] "2020-05-11"

We can change the timezone List of time zones: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones

We change the timezone to Jakarta

head(ymd(sales_clean$JoinDate, tz = "Asia/Jakarta"))

#> [1] "2014-04-05 WIB" "2011-10-04 WIB" "2014-04-05 WIB" "2014-04-05 WIB"
#> [5] "2014-04-05 WIB" "2020-05-11 WIB"

However, since our data is of type YYYY-MM-DD the timezone change isn’t too significant unless the data type contains time.

CompletedDate Column

sales_clean$CompletedDate <- ymd(sales_clean$CompletedDate)

class(sales_clean$CompletedDate)

#> [1] "Date"

PurchasedDate Column

sales_clean$PurchasedDate <- ymd(sales_clean$PurchasedDate, tz = "Asia/Jakarta")

class(sales_clean$PurchasedDate)

#> [1] "POSIXct" "POSIXt"

PurchasedDate is a column that contains when a customer buys a product. Therefore, it is far more insightful if we can extract some of the components in it.

Task 1: Extract Year from PurchasedDate

sales_clean$year_purchased <- year(sales_clean$PurchasedDate)
head(sales_clean$year_purchased)

#> [1] 2021 2021 2021 2021 2021 2021

Task 2: Extract Month from PurchasedDate

sales_clean$month_purchased <- month(sales_clean$PurchasedDate, label = T, abbr = F)
head(sales_clean$month_purchased)

#> [1] February January  October  February December April   
#> 12 Levels: January < February < March < April < May < June < ... < December

Task 3: Extract day name from PurchasedDate

sales_clean$day_purchased <- wday(sales_clean$PurchasedDate, label = T, abbr = F)
head(sales_clean$day_purchased)

#> [1] Tuesday   Wednesday Wednesday Wednesday Thursday  Monday   
#> 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday

Task 4: Create a column containing how long the customer has been from the date he joined to his last purchase

For advanced analysis, we will only use purchase data when that purchase last occurred to reduce data redundancies. For that we can use unique() and match().

sales_unique <- sales_clean[match(unique(sales_clean$IDTransaksi),sales_clean$IDTransaksi),]

Data Exploration

Customer Profile Analysis

library(ggplot2)

Sebelum membuat visualisasi, penting untuk melakukan aggregasi data yang tepat. Akan coba dibuat persebaran jenis kelamin yang berbelanja di beberapa channel pembelian susu.

Gender

Mari kita buat dataframe dari tabel frekuensi dari Gender

table(sales_clean$Gender)

#> 
#> Laki-laki Laki-Laki Perempuan PEREMPUAN 
#>       113         0      3078         0

Dapat dilihat bahwa terdapat beberapa kategori yang redundan, hal ini akan coba ditakeout dari dataframe

gender_freq <- as.data.frame(table(sales_clean$Gender))
gender_freq <- gender_freq[gender_freq$Var1 %in% c("Laki-laki","Perempuan"), ]
gender_freq

ggplot(data = gender_freq, mapping = aes(x = Var1, y = Freq)) +
  geom_col(fill = "pink") +
  geom_label(aes(label = Freq), col = "black") +
  labs(title = "Demografi Usia pelanggan Susu", subtitle = "Pada Channel E-Commerce di Indonesia", x = "Gender", y = "Total") +
  theme_minimal()

gender_freq2 <- as.data.frame(table(sales_clean$Gender, sales_clean$Brand))
gender_freq2 <- gender_freq2[gender_freq2$Var1 %in% c("Laki-laki","Perempuan"), ]
gender_freq2

ggplot(data = gender_freq2, mapping = aes(x = Var1, y = Freq, fill = Var2)) +
  geom_col()+
  geom_label(aes(label = Freq), col = "black") +
  labs(title = "Demografi Usia pelanggan Susu", subtitle = "Pada Channel E-Commerce di Indonesia", x = "Gender", y = "Total") +
  facet_wrap(~ Var2, ncol = 2) +
  scale_fill_manual(values=c('tomato', 'red')) +
  theme(legend.position = 'none')

Age

age_freq <- as.data.frame(table(sales_clean$age_cat))
age_freq

ggplot(data = age_freq, mapping = aes(x = Var1, y = Freq)) +
  geom_col(fill = "tomato") +
  labs(title = "Age Customer Distribution",
       subtitle = "From e-commerce Channel",
       x = "Rentang usia",
       y = NULL) +
  theme_minimal() +
  theme(plot.title = element_text(colour = "darkgreen", hjust = 0.5, face = "bold", size = 20),
        plot.subtitle = element_text(hjust = 0.5))

District

Top 10 District from Valuable Customer

district_freq <- as.data.frame(table(sales_clean$District))

# ordering
district_freq <- district_freq[order(district_freq$Freq, decreasing = T), ]
names(district_freq) <- c("District", "Freq")
district_freq

Membuat Visualisasi

ggplot(data = district_freq[1:10, ], 
       aes(x = Freq, y = reorder(District, Freq))) +
  geom_col(fill = "darkseagreen")

Ingin difokuskan untuk District Kembangan saja, sehingga perlu dihighligt dengan warna yang berbeda

ggplot(data = district_freq[1:10, ], 
       aes(x = Freq, y = reorder(District, Freq))) +
  geom_col(fill = "darkseagreen") +
  geom_col(fill = "darkgreen", data = district_freq[2,])

Tambahkan label hanya untuk Top 3 District teratas

ggplot(data = district_freq[1:10, ], 
       aes(x = Freq, y = reorder(District, Freq))) +
  geom_col(fill = "darkseagreen") +
  geom_col(fill = "darkgreen", data = district_freq[2,]) +
  geom_label(data = district_freq[1:3,], 
             aes(label = Freq), nudge_x = 2)

Tambahkan garis yang menunjukan nilai rata-rata dari keseluruhan Frekuensi, kemudian berikan label berisikan nilai rata-rata tersebut.

ggplot(data = district_freq[1:10, ], 
       aes(x = Freq, y = reorder(District, Freq))) +
  geom_col(fill = "darkseagreen") +
  geom_col(fill = "darkgreen", data = district_freq[2,]) +
  geom_label(data = district_freq[1:3,], 
             aes(label = Freq), nudge_x = 2) +
  geom_vline(xintercept = mean(district_freq$Freq),
             linetype = 2,
             lwd = 1,
             color = "tomato") + # memberikan garis mean
  geom_label(aes(label = round(mean(district_freq$Freq),2),
                 x = 19,
                 y = "DUREN SAWIT"),
             fill = "tomato") # memberikan label mean

Menambahkan ticks mark pada sumbu x

ggplot(data = district_freq[1:10, ], 
       aes(x = Freq, y = reorder(District, Freq))) +
  geom_col(fill = "darkseagreen") +
  geom_col(fill = "darkgreen", data = district_freq[2,]) +
  geom_label(data = district_freq[1:3,], 
             aes(label = Freq), nudge_x = 2) +
  geom_vline(xintercept = mean(district_freq$Freq),
             linetype = 2,
             lwd = 1,
             color = "tomato") + # memberikan garis mean
  geom_label(aes(label = round(mean(district_freq$Freq),2),
                 x = 19,
                 y = "DUREN SAWIT"),
             fill = "tomato") + # memberikan label mean
  scale_x_continuous(breaks = seq(0, 140, 20))

Percantik plot dengan labs()

ggplot(data = district_freq[1:10, ], 
       aes(x = Freq, y = reorder(District, Freq))) +
  geom_col(fill = "darkseagreen") +
  geom_col(fill = "darkgreen", data = district_freq[2,]) +
  geom_label(data = district_freq[1:3,], 
             aes(label = Freq), nudge_x = 2) +
  geom_vline(xintercept = mean(district_freq$Freq),
             linetype = 2,
             lwd = 1,
             color = "tomato") + # memberikan garis mean
  geom_label(aes(label = round(mean(district_freq$Freq),2),
                 x = 19,
                 y = "DUREN SAWIT"),
             fill = "tomato")+ # memberikan label mean
  scale_x_continuous(breaks = seq(0, 140, 20)) +
  labs(title = "Top 10 District",
       subtitle = "Based on Customer Frequency",
       x = "Frequency",
       y = NULL)

Percantik plot dengan theme()

ggplot(data = district_freq[1:10, ], 
       aes(x = Freq, y = reorder(District, Freq))) +
  geom_col(fill = "darkseagreen") +
  geom_col(fill = "darkgreen", data = district_freq[2,]) +
  geom_label(data = district_freq[1:3,], 
             aes(label = Freq), nudge_x = 2) +
  geom_vline(aes(xintercept = mean(district_freq$Freq)),
             linetype = 2,
             lwd = 1,
             color = "tomato") + # memberikan garis mean
  geom_label(aes(label = round(mean(district_freq$Freq),2),
                 x = 19,
                 y = "DUREN SAWIT"),
             fill = "tomato")+ # memberikan label mean
  scale_x_continuous(breaks = seq(0, 140, 20)) +
  labs(title = "Top 10 District",
       subtitle = "Based on Customer Frequency",
       x = "Frequency",
       y = NULL) +
  theme_minimal() +
  theme(axis.text.y = element_text(face = "bold", color = "dimgrey"),
        plot.title = element_text(face = "bold", size = 15))

Province

province_freq <- as.data.frame(table(sales_clean$Province))

# Diurutkan
province_freq <- province_freq[order(province_freq$Freq, decreasing = T), ]

# Melakukan pengubahan nama kolom
names(province_freq) <- c("Province", "Freq")

province_freq

ggplot(province_freq, aes(x = reorder(Province, -Freq), y = Freq)) +
  geom_bar(stat = "identity", aes(fill = Province), show.legend = F, width = 0.6) +
  scale_fill_brewer(palette = 5) +
  geom_text(aes(label = Freq), nudge_y = 50) +
  labs(title = "Province Customer Distribution",
       subtitle = "from e-commerce channel",
       y = NULL,
       x = "Province") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")
        )

Business Question: Ingin diketahui distribusi nilai AmountwithPPn < 2000000 dan > 1000000 untuk masing-masing Province menggunakan boxplot.

province_agg <- sales_clean[sales_clean$AmountwithPPn <= 2000000 & sales_clean$AmountwithPPn >= 1000000,]
head(province_agg)

Visualisasi boxplot dan jitter yang berisikan nilai Qty

ggplot(data = province_agg, aes(x = Province, y = AmountwithPPn)) +
  geom_boxplot(outlier.color = "red") +
  geom_jitter(aes(size = Qty))

Memberikan warna

ggplot(data = province_agg, aes(x = Province, y = AmountwithPPn)) +
  geom_boxplot(outlier.shape = NA, fill = "khaki", color = "darkgrey") +
  geom_jitter(aes(size = Qty, color = Province)) +
  scale_color_brewer(palette = 8) +
  theme_minimal()

Mempercantik Plot

ggplot(data = province_agg, aes(x = Province, y = AmountwithPPn)) +
  geom_boxplot(outlier.shape = NA, fill = "khaki", color = "darkgrey") +
  geom_jitter(aes(size = Qty, color = Province)) +
  scale_color_brewer(palette = 8) +
  labs(title = "Boxplot Distribution of Province",
       subtitle = "Based on Amount With PPN & Quantity",
       x = NULL,
       y = "Amount with PPN (in Rupiah)",
       size = "Quantity") +
  theme_minimal()

Menambahkan koma pada axis y untuk satuan Rupiah

library(scales)
ggplot(data = province_agg, aes(x = Province, y = AmountwithPPn)) +
  geom_boxplot(outlier.shape = NA, fill = "khaki", color = "darkgrey") +
  geom_jitter(aes(size = Qty, color = Province)) +
  scale_color_brewer(palette = 8) +
  scale_y_continuous(labels = comma) +
  labs(title = "Boxplot Distribution of Province",
       subtitle = "Based on Amount With PPN & Quantity",
       x = NULL,
       y = "Amount with PPN (in Rupiah)",
       size = "Quantity") +
  theme_minimal()

Insight:

Jawa Barat merupakan provinsi dengan jumlah Amount with PPN yang paling bervariatif di antara lainnya.
Berdasarkan nilai median, Banten merupakan Provinsi dengan Amount with PPN tertinggi

Occupation

Top 10 Occupation

Persiapkan datanya

occ_freq <- as.data.frame(table(sales_clean$Occupation))
occ_freq <- occ_freq[order(occ_freq$Freq, decreasing = T), ]

Top 10 Occupation

occ_freq_5 <- head(occ_freq, 10)
names(occ_freq_5) <- c("Occupation", "Freq")
occ_freq_5

Visualisasi

ggplot(data = occ_freq_5, mapping = aes(x = Freq, y = reorder(Occupation, Freq))) +
  geom_col(aes(fill = Freq), show.legend = F) +
  scale_fill_gradient(low = "lightsalmon", high = "darkred") +
  labs(title = "Customer Occupation",
       subtitle = "from e-commerce channel",
       x = "Frequency",
       y = NULL) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.4),
        plot.subtitle = element_text(hjust = 0.4))

Apabila ingin ditampilkan nilai proporsi pada visualisasi

occ_prop <- as.data.frame(prop.table(table(sales_clean$Occupation)))
occ_prop <- occ_prop[order(occ_prop$Freq, decreasing = T), ]
occ_prop_10 <- head(occ_prop, 10)
names(occ_prop_10) <- c("Occupation", "Prop")
occ_prop_10

ggplot(data = occ_prop_10, mapping = aes(x = Prop, y = reorder(Occupation, Prop))) +
  geom_col(aes(fill = Prop), show.legend = F) +
  scale_fill_gradient(low = "lightsalmon", high = "darkred") +
  geom_label(aes(label = scales::percent(round(Prop, 4))), size = 3, nudge_x = 0.02) +
  scale_x_continuous(labels = scales::percent) + # menambahkan label pada axis
  labs(title = "Customer Occupation Percentage",
       subtitle = "from e-commerce channel",
       x = "Percentage",
       y = NULL) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.4),
        plot.subtitle = element_text(hjust = 0.4))

Source

Top 5 Source Detail

source_freq <- as.data.frame(table(sales_clean$Source))
source_freq <- source_freq[order(source_freq$Freq, decreasing = T), ]
source_freq <- head(source_freq, 5)
names(source_freq) <- c("Source", "Freq")
source_freq

Visualization

ggplot(data = source_freq, mapping = aes(x = Freq, y = reorder(Source, Freq))) +
  geom_col(aes(fill = Freq), show.legend = F) +
  scale_fill_gradient(low = "plum", high = "plum4") +
  geom_text(aes(label = Freq), nudge_x = 80) +
  labs(title = "Customer by Source",
       subtitle = "from e-commerce channel",
       x = "Frequency",
       y = NULL) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.4),
        plot.subtitle = element_text(hjust = 0.4))

Customer Revenue Analysis

By analyzing customer revenue patterns, businesses can make informed decisions, enhance customer satisfaction, and drive long-term growth. This report aims to provide a comprehensive overview of customer revenue analysis, its significance, methodologies, and the actionable insights it can offer to organizations

Monthly Trend Sales

Akan dilihat bagaimana median trend Sales setiap bulan dan tahun hanya untuk tahun 2020 dan 2021 berdasarkan AmountwithPPn

agg_sales <- aggregate(AmountwithPPn ~ year_purchased + month_purchased,
                       data = sales_clean,
                       FUN = median)
agg_sales <- agg_sales[agg_sales$year_purchased %in% c(2020, 2021), ]
agg_sales$year_purchased <- as.factor(agg_sales$year_purchased)

Visualisasi dengan geom_line

ggplot(data = agg_sales, aes(x = month_purchased, y = AmountwithPPn)) +
  geom_line(aes(group = year_purchased, color = year_purchased), size = 1) +
  geom_point(aes(color = year_purchased), size = 2) +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  scale_color_brewer(palette = "Set2")

Hanya ingin diberikan label untuk bulan july

ggplot(data = agg_sales, aes(x = month_purchased, y = AmountwithPPn)) +
  geom_line(aes(group = year_purchased, color = year_purchased), size = 1) +
  geom_point(aes(color = year_purchased), size = 2) +
  geom_label(data = agg_sales[agg_sales$month_purchased == 'July', ], 
             aes(label = scales::comma(round(AmountwithPPn, 1))), 
             size = 2, 
             nudge_y = 10000) +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  scale_y_continuous(labels = scales::comma) +
  scale_color_brewer(palette = "Set2")

Percantik plot

ggplot(data = agg_sales, aes(x = month_purchased, y = AmountwithPPn)) +
  geom_line(aes(group = year_purchased, color = year_purchased), size = 1) +
  geom_point(aes(color = year_purchased), size = 2) +
  geom_label(data = agg_sales[agg_sales$month_purchased == 'July', ], 
             aes(label = scales::comma(round(AmountwithPPn, 1))), 
             size = 2, 
             nudge_y = 10000) +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  scale_y_continuous(labels = scales::comma) +
  scale_color_brewer(palette = "Set2") +
  labs(title = "Monthly Trend Sales Revenue",
       subtitle = "in 2020 and 2021",
       y = "Month",
       x = "Amount of Sales", 
       color = "Year") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

Contribution Sales by Source (in Percentage)

Step by Step:

Persiapkan Data

source_agg <- aggregate(AmountwithPPn ~ Source,
                        data = sales_clean,
                        FUN = sum)
source_agg <- source_agg[order(source_agg$AmountwithPPn, decreasing = T), ]
source_agg$percentage <- source_agg$AmountwithPPn/sum(source_agg$AmountwithPPn)
source_agg

ggplot(data = source_agg, aes(x = percentage, y = reorder(Source, percentage))) +
  geom_col(aes(fill = percentage), show.legend = F) +
  geom_label(aes(label = scales::percent(round(percentage,4))), size = 3, nudge_x = 0.05) +
  scale_fill_gradient(low = "peru", high = "moccasin") +
  scale_x_continuous(label = scales::percent) +
  labs(title = "Contribution Sales by Source",
       subtitle = "in percentage (%)",
       y = "Source") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.35, face = "bold"),
        plot.subtitle = element_text(hjust = 0.35))

Contribution Sales by Product

product_agg <- aggregate(AmountwithPPn ~ Product + Brand,
                        data = sales_clean,
                        FUN = sum)
product_agg <- product_agg[order(product_agg$AmountwithPPn, decreasing = T), ]
product_agg

Membuat multivariate plot

library(scales)
ggplot(data = product_agg, mapping = aes(y = reorder(Product,AmountwithPPn), x = AmountwithPPn)) +
  geom_col(aes(fill = Brand), show.legend = F) +
  geom_text(aes(label = comma(AmountwithPPn)), size = 2) +
  facet_wrap(~Brand, ncol = 2, scales = "free") +
  # untuk mengatur wraping nilai agar tidak terlalu panjang
  scale_y_discrete(label = label_wrap(20)) +
  # untuk mengatur agar skala x ada koma
  scale_x_continuous(label = comma) +
  # untuk mengatur ukuran axis x
  theme(axis.text.y = element_text(size = 7),
        axis.text.x = element_text(size = 5)) +
  labs(title = "Contribution Sales by Product",
       subtitle = "in Total",
       y = "Product Name",
       x = NULL)

Contribution Sales by City

Kita akan coba melakukan agregasi data dengan menampilkan nilai rata-rata dari sales/revenue untuk province jakarta saja.

# melakukan filtering hanya untuk Province DKI Jakarta
city_jakarta <- sales_clean[sales_clean$Province == "DKI JAKARTA", ]

# Melakukan Aggregate dan pembentukan percent
city_agg1 <- aggregate(AmountwithPPn ~ City + Brand,
          data = city_jakarta,
          FUN = mean)
city_agg1 <- transform(city_agg1,
                       perc = ave(AmountwithPPn,
                                  City,
                                  FUN = prop.table))
city_agg1

In Percentage Multivariate Plot

ggplot(data = city_agg1, mapping = aes(x = AmountwithPPn, y = City))+
  geom_col(aes(group = Brand, fill = Brand), position = position_fill(reverse = T)) +
  scale_x_continuous(label = percent) +
  geom_text(aes(label = percent(perc)), position = position_fill(vjust = 0.5), size = 3.1) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal()

Contribution Sales by Province

# Melakukan Aggregate dan pembentukan percent
prov_agg1 <- aggregate(AmountwithPPn ~ Province + Brand,
          data = sales_clean,
          FUN = mean)
prov_agg1

ggplot(data = prov_agg1, aes(x = Province, y = AmountwithPPn, fill = Brand)) +
  geom_col(aes(group = Brand), position = "dodge") +
  geom_text(aes(label = comma(x = AmountwithPPn, prefix = "Rp.", suffix = ",-")), 
            position = position_dodge(0.9), vjust = -0.5, size = 2.9) +
  scale_y_continuous(labels = comma, breaks = seq(0,600000,100000), limits = c(0, 500000)) +
  theme_minimal()

Spatial analysis using leaflet

library(leaflet)

city_all <- aggregate(AmountwithPPn ~ City,
          data = sales_clean,
          FUN = mean)
city_all$percent <- city_all$AmountwithPPn/sum(city_all$AmountwithPPn)
city_all

# menambahkan data latitude_longitude
latlong <- data.frame("lat" = c(-6.9147444, -6.8937121, -6.233333, -6.6, -7.3333333, -6.8172531, -6.0169825, -6.880239, -6.715534, -6.39, -7.227906,    -6.336315, -6.211544, -6.211544,    -6.332973, -6.211544, -6.211544, -6.3227303, -6.9833333, -6.8531026, -6.314835, -6.5386806, -6.12009, -6.569361, -6.92405,  -6.8329, -6.1783056, -6.2888889, -7.327954), 
                         
                         "long" = c(107.6098111, 107.4321959, 107, 106.8, 108.35, 107.1307289, 106.040506, 107.5355, 108.564003, 106.83, 107.908699, 108.325104, 106.845172, 106.845172, 106.807915, 106.845172, 106.845172, 107.3375791, 108.4833333, 108.2258897, 106.103897, 107.4499404, 106.150299,    107.752403, 106.922203, 107.9532, 106.6318889,  106.7180556, 108.214104))

# menggabungkan dua kolom data
city_all_clean <- cbind(city_all, latlong)

library(maps)
global <- map_data("world")
world <- ggplot() + 
  geom_polygon(data = global, aes(x=long, y = lat, group = group), 
               fill = "gray85", color = "gray80") +
  coord_fixed(1.3)

Indo <- world + xlim(94,142) + ylim(-11,7.5)

library(ggrepel)
world + xlim(105,115) + ylim(-9,-5.8)+ 
  geom_point(data = city_all_clean, aes(x = long, y = lat, size = AmountwithPPn, color = AmountwithPPn),
             alpha = 0.5, 
             show.legend = F) +
  scale_color_gradient(low = "yellow", high = "red") +
  geom_text_repel(data = city_all_clean, aes(x = long, y = lat, label= City), size = 2)

library(htmltools)
library(dplyr)
library(leaflet)
m <- leaflet(city_all_clean)


pop <- paste("Region: ", city_all_clean$City, "<br/>",
             "Revenue: ", round(city_all_clean$AmountwithPPn)) %>% 
  lapply(htmltools::HTML)

# mendefinisikan variabel yang menjadi dasar pewarnaan
col <- city_all_clean$AmountwithPPn

mybins <- seq(50000, 700000, by=150000)
pal <- colorBin( palette="YlOrRd", 
                       domain=col, 
                       bins=mybins)

m %>% 
  addProviderTiles(providers$CartoDB.DarkMatter) %>% 
  addCircleMarkers(~long, ~lat, 
                   fillColor = ~pal(col),
                   weight = 1,
                   color = "white",
                   fillOpacity = 0.8,
                   radius=8, 
                   stroke=FALSE,
                   label = pop,) %>%
  addLegend("bottomright", 
            pal = pal,
            values = ~col,
            title = "Revenue",
            labFormat = labelFormat(digits = 2),
            opacity = 1)