Introduction
Every company or business entity that offers products or services to customers in the big data era is now turning to utilizing its customer data sources. The marketing concept is a marketing plan based on meeting the needs and desires of customers to achieve organizational goals. This makes customer needs something that needs to be fulfilled. Companies can collect more complete information about customers and potential customers from various sources, one of which is transactional data.
Read Data
Transactional data is data that contains transaction history. The transactional data components that exist in Kalbe Nutritionals is as follows.
The description of each variable is outlined below;
KontakID
JoinDate
GROUP SM
CompletedDate
PurchaseDate
IDTransaksi
Source
Amount
AmountwithPPn
Qty
Channel
ChannelGroup
CityToko
ProvinceToko
LOB
Brand
SubBrand
Product
SKUProduct
Gender
BoD
Age2
Occupation
District
City
Province
To dig deeper insights, then we will look at the most spending made on the transaction data that we have
Hanya mengambil beberapa source
yang potensial saja
sales_clean <- sales[sales$Source == c("WHATSAPP", "SHOPEE", "TOKOPEDIA", "SHOPEE Susu_A", "Tokopedia Susu_A", "LAZADA", "JD.ID", "LAZADA Susu_A", "BUKALAPAK", "KALCARE"),]
str(sales_clean)
#> tibble [6,513 × 26] (S3: tbl_df/tbl/data.frame)
#> $ KontakID : chr [1:6513] "0507-0047164" "0506-0022783" "0506-0029200" "0506-0022783" ...
#> $ JoinDate : POSIXct[1:6513], format: "2011-10-05" "2014-04-05" ...
#> $ GROUP SM : chr [1:6513] "KHD" "KHD" "KHD" "KHD" ...
#> $ CompletedDate: POSIXct[1:6513], format: "2020-05-29" "2021-02-03" ...
#> $ PurchasedDate: POSIXct[1:6513], format: "2020-05-27" "2021-02-02" ...
#> $ IDTransaksi : chr [1:6513] "YE-013170" "ZB-000573" "ZA-006302" "ZJ-005871" ...
#> $ Source : chr [1:6513] "WHATSAPP" "WHATSAPP" "WHATSAPP" "WHATSAPP" ...
#> $ Amount : chr [1:6513] "231363.63" "157727.27" "360000.00" "456818.19" ...
#> $ AmountwithPPn: chr [1:6513] "254499.99" "173500.00" "396000.00" "502500.01" ...
#> $ Qty : chr [1:6513] "1.00" "1.00" "3.00" "3.00" ...
#> $ Channel : chr [1:6513] "KHD - WHATSAPP" "KHD - WHATSAPP" "KHD - WHATSAPP" "KHD - WHATSAPP" ...
#> $ ChannelGroup : chr [1:6513] "KHD" "KHD" "KHD" "KHD" ...
#> $ CityToko : chr [1:6513] "JAKARTA TIMUR" "JAKARTA TIMUR" "JAKARTA TIMUR" "JAKARTA TIMUR" ...
#> $ ProvinceToko : chr [1:6513] "DKI JAKARTA" "DKI JAKARTA" "DKI JAKARTA" "DKI JAKARTA" ...
#> $ LOB : chr [1:6513] "KN2" "KN2" "KN2" "KN2" ...
#> $ Brand : chr [1:6513] "Susu_A" "Susu_A" "Susu_A" "Susu_A" ...
#> $ Subbrand : chr [1:6513] "CHS" "CHM" "CHS" "CHK" ...
#> $ Product : chr [1:6513] "Susu_A SCHOOL REGULER" "Susu_A MIL SOYA" "Susu_A SCHOOL REGULER" "Susu_A KID SOYA" ...
#> $ SKUProduct : chr [1:6513] "Susu_A SCHOOL REGULER VANILLA 4X400 G R16" "Susu_A MIL SOYA 600 G" "Susu_A SCHOOL REGULER VANILLA 2 X 400 G" "Susu_A KID SOYA 2X300 G" ...
#> $ Gender : chr [1:6513] "Perempuan" "Perempuan" "Perempuan" "Perempuan" ...
#> $ BoD : chr [1:6513] "2005-07-08 16:43:06.973000000" "2005-06-08 13:56:28.350000000" "2005-06-15 15:18:06.287000000" "2005-06-08 13:56:28.350000000" ...
#> $ Age2 : num [1:6513] 16.5 16.6 16.5 16.6 16.5 ...
#> $ Occupation : chr [1:6513] NA "IBU RUMAH TANGGA" "PEGAWAI SWASTA" "IBU RUMAH TANGGA" ...
#> $ District : chr [1:6513] "PESANGGRAHAN" "SAWAH BESAR" "MAKASAR" "SAWAH BESAR" ...
#> $ City : chr [1:6513] "JAKARTA SELATAN" "JAKARTA PUSAT" "JAKARTA TIMUR" "JAKARTA PUSAT" ...
#> $ Province : chr [1:6513] "DKI JAKARTA" "DKI JAKARTA" "DKI JAKARTA" "DKI JAKARTA" ...
We will only focus on the variables that will be used and remove redundant variables
Cleansing Data
Explicit Coercion
# mengubah tipe data factor
sales_clean[,c("GROUP SM", "LOB", "Brand", "Subbrand", "Gender", "Occupation", "City", "Province")] <- lapply(sales_clean[,c("GROUP SM", "LOB", "Brand", "Subbrand", "Gender", "Occupation", "City", "Province")], as.factor)
# mengubah tipe data integer
sales_clean[,c("Qty", "Age2")] <- lapply(sales_clean[,c("Qty", "Age2")], as.integer)
#> tibble [6,513 × 20] (S3: tbl_df/tbl/data.frame)
#> $ KontakID : chr [1:6513] "0507-0047164" "0506-0022783" "0506-0029200" "0506-0022783" ...
#> $ JoinDate : POSIXct[1:6513], format: "2011-10-05" "2014-04-05" ...
#> $ GROUP SM : Factor w/ 16 levels "CHAKRA","DMQI",..: 8 8 8 8 8 8 8 8 10 8 ...
#> $ CompletedDate: POSIXct[1:6513], format: "2020-05-29" "2021-02-03" ...
#> $ PurchasedDate: POSIXct[1:6513], format: "2020-05-27" "2021-02-02" ...
#> $ IDTransaksi : chr [1:6513] "YE-013170" "ZB-000573" "ZA-006302" "ZJ-005871" ...
#> $ Source : chr [1:6513] "WHATSAPP" "WHATSAPP" "WHATSAPP" "WHATSAPP" ...
#> $ AmountwithPPn: num [1:6513] 254500 173500 396000 502500 43500 ...
#> $ Qty : int [1:6513] 1 1 3 3 1 1 6 2 1 3 ...
#> $ LOB : Factor w/ 2 levels "KN2","KN4": 1 1 1 1 2 1 1 2 1 1 ...
#> $ Brand : Factor w/ 2 levels "Susu_A","Susu_B": 1 1 1 1 2 1 1 2 1 1 ...
#> $ Subbrand : Factor w/ 11 levels "BMT","CHGM","CHGP",..: 6 5 6 4 10 5 2 10 6 6 ...
#> $ Product : chr [1:6513] "Susu_A SCHOOL REGULER" "Susu_A MIL SOYA" "Susu_A SCHOOL REGULER" "Susu_A KID SOYA" ...
#> $ SKUProduct : chr [1:6513] "Susu_A SCHOOL REGULER VANILLA 4X400 G R16" "Susu_A MIL SOYA 600 G" "Susu_A SCHOOL REGULER VANILLA 2 X 400 G" "Susu_A KID SOYA 2X300 G" ...
#> $ Gender : Factor w/ 4 levels "Laki-laki","Laki-Laki",..: 3 3 3 3 3 3 3 3 3 3 ...
#> $ Age2 : int [1:6513] 16 16 16 16 16 16 16 16 16 16 ...
#> $ Occupation : Factor w/ 22 levels "BEAUTY AMBASSADOR",..: NA 10 14 10 NA 10 NA 10 14 14 ...
#> $ District : chr [1:6513] "PESANGGRAHAN" "SAWAH BESAR" "MAKASAR" "SAWAH BESAR" ...
#> $ City : Factor w/ 29 levels "BANDUNG","BANDUNG BARAT",..: 15 14 16 14 16 14 15 14 17 16 ...
#> $ Province : Factor w/ 3 levels "BANTEN","DKI JAKARTA",..: 2 2 2 2 2 2 2 2 2 2 ...
Handling Missing Values
Now let’s look at the number of missing values in our data.
#> KontakID JoinDate GROUP SM CompletedDate PurchasedDate
#> 8 8 26 8 8
#> IDTransaksi Source AmountwithPPn Qty LOB
#> 8 8 8 8 8
#> Brand Subbrand Product SKUProduct Gender
#> 8 8 8 8 21
#> Age2 Occupation District City Province
#> 8 2923 999 44 8
Performs handling missing values by taking only rows that are not missing.
Check again whether there are still missing rows in the data
#> [1] FALSE
Duplicated Data
First of all, we’ll check for duplicate data. This can be done with
the duplicated()
function. To see the number of duplicate
rows, we can wrap the duplicated()
function with the
sum()
function.
#> [1] 0
The 0 above indicates that there are no identical rows.
Data Discretization
A data may have one of several types, such as numeric or categorical. Numerical data discretization is one of the most influential data processing steps in data mining. The goal of data discretization is to find a concise representation of the data in the form of categories that are adequate for analysis while retaining as much information as possible. An example of data discretization can be seen in Figure below, where changes are made by discretizing set values from numerical form into categories with certain intervals.
Furthermore, we will discretizing Age2
variables.
convert_age <- function(y){
if(y <= 19)
{y <- "<20 tahun"}
else if(y > 20 & y <= 35)
{y <- "20-35 tahun" }
else if(y > 35 & y <= 50)
{y <- "36-50 tahun" }
else if(y > 50 & y <= 65)
{y <- "51-65 tahun"}
else
{y <- "65 tahun"}
}
#> [1] "<20 tahun" "<20 tahun" "<20 tahun" "<20 tahun" "<20 tahun" "<20 tahun"
Working with Lubridate
Taking a quick peek at the JoinDate
,
CompletedDate
, and PurchasedDate
columns
reveals that the date values are stored in a year-month-day format, with
- (dash) being the delimiter:
#> [1] "2014-04-05 UTC" "2011-10-04 UTC" "2014-04-05 UTC" "2014-04-05 UTC"
#> [5] "2014-04-05 UTC" "2020-05-11 UTC"
#> [1] "2021-02-03 UTC" "2021-01-14 UTC" "2021-10-22 UTC" "2021-02-25 UTC"
#> [5] "2021-12-11 UTC" "2021-04-20 UTC"
#> [1] "2021-02-02 UTC" "2021-01-13 UTC" "2021-10-13 UTC" "2021-02-24 UTC"
#> [5] "2021-12-09 UTC" "2021-04-19 UTC"
Time data manipulation using the lubridate
library
A brief use of lubridate can be seen in the
cheatsheet
.
JoinDate
Column
#> [1] "2014-04-05" "2011-10-04" "2014-04-05" "2014-04-05" "2014-04-05"
#> [6] "2020-05-11"
We can change the timezone List of time zones: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
We change the timezone to Jakarta
#> [1] "2014-04-05 WIB" "2011-10-04 WIB" "2014-04-05 WIB" "2014-04-05 WIB"
#> [5] "2014-04-05 WIB" "2020-05-11 WIB"
However, since our data is of type YYYY-MM-DD the timezone change isn’t too significant unless the data type contains time.
CompletedDate
Column
#> [1] "Date"
PurchasedDate
Column
#> [1] "POSIXct" "POSIXt"
PurchasedDate
is a column that contains when a customer
buys a product. Therefore, it is far more insightful if we can extract
some of the components in it.
Task 1: Extract Year from
PurchasedDate
#> [1] 2021 2021 2021 2021 2021 2021
Task 2: Extract Month from
PurchasedDate
sales_clean$month_purchased <- month(sales_clean$PurchasedDate, label = T, abbr = F)
head(sales_clean$month_purchased)
#> [1] February January October February December April
#> 12 Levels: January < February < March < April < May < June < ... < December
Task 3: Extract day name from
PurchasedDate
sales_clean$day_purchased <- wday(sales_clean$PurchasedDate, label = T, abbr = F)
head(sales_clean$day_purchased)
#> [1] Tuesday Wednesday Wednesday Wednesday Thursday Monday
#> 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
Task 4: Create a column containing how long the customer has been from the date he joined to his last purchase
For advanced analysis, we will only use purchase data when that
purchase last occurred to reduce data redundancies. For that we can
use unique()
and match()
.
Data Exploration
Customer Profile Analysis
Sebelum membuat visualisasi, penting untuk melakukan aggregasi data yang tepat. Akan coba dibuat persebaran jenis kelamin yang berbelanja di beberapa channel pembelian susu.
Gender
Mari kita buat dataframe dari tabel frekuensi dari
Gender
#>
#> Laki-laki Laki-Laki Perempuan PEREMPUAN
#> 113 0 3078 0
Dapat dilihat bahwa terdapat beberapa kategori yang redundan, hal ini akan coba ditakeout dari dataframe
gender_freq <- as.data.frame(table(sales_clean$Gender))
gender_freq <- gender_freq[gender_freq$Var1 %in% c("Laki-laki","Perempuan"), ]
gender_freq
ggplot(data = gender_freq, mapping = aes(x = Var1, y = Freq)) +
geom_col(fill = "pink") +
geom_label(aes(label = Freq), col = "black") +
labs(title = "Demografi Usia pelanggan Susu", subtitle = "Pada Channel E-Commerce di Indonesia", x = "Gender", y = "Total") +
theme_minimal()
gender_freq2 <- as.data.frame(table(sales_clean$Gender, sales_clean$Brand))
gender_freq2 <- gender_freq2[gender_freq2$Var1 %in% c("Laki-laki","Perempuan"), ]
gender_freq2
ggplot(data = gender_freq2, mapping = aes(x = Var1, y = Freq, fill = Var2)) +
geom_col()+
geom_label(aes(label = Freq), col = "black") +
labs(title = "Demografi Usia pelanggan Susu", subtitle = "Pada Channel E-Commerce di Indonesia", x = "Gender", y = "Total") +
facet_wrap(~ Var2, ncol = 2) +
scale_fill_manual(values=c('tomato', 'red')) +
theme(legend.position = 'none')
Age
ggplot(data = age_freq, mapping = aes(x = Var1, y = Freq)) +
geom_col(fill = "tomato") +
labs(title = "Age Customer Distribution",
subtitle = "From e-commerce Channel",
x = "Rentang usia",
y = NULL) +
theme_minimal() +
theme(plot.title = element_text(colour = "darkgreen", hjust = 0.5, face = "bold", size = 20),
plot.subtitle = element_text(hjust = 0.5))
District
Top 10 District from Valuable Customer
district_freq <- as.data.frame(table(sales_clean$District))
# ordering
district_freq <- district_freq[order(district_freq$Freq, decreasing = T), ]
names(district_freq) <- c("District", "Freq")
district_freq
Membuat Visualisasi
ggplot(data = district_freq[1:10, ],
aes(x = Freq, y = reorder(District, Freq))) +
geom_col(fill = "darkseagreen")
Ingin difokuskan untuk District Kembangan saja, sehingga perlu dihighligt dengan warna yang berbeda
ggplot(data = district_freq[1:10, ],
aes(x = Freq, y = reorder(District, Freq))) +
geom_col(fill = "darkseagreen") +
geom_col(fill = "darkgreen", data = district_freq[2,])
Tambahkan label hanya untuk Top 3 District teratas
ggplot(data = district_freq[1:10, ],
aes(x = Freq, y = reorder(District, Freq))) +
geom_col(fill = "darkseagreen") +
geom_col(fill = "darkgreen", data = district_freq[2,]) +
geom_label(data = district_freq[1:3,],
aes(label = Freq), nudge_x = 2)
Tambahkan garis yang menunjukan nilai rata-rata dari keseluruhan Frekuensi, kemudian berikan label berisikan nilai rata-rata tersebut.
ggplot(data = district_freq[1:10, ],
aes(x = Freq, y = reorder(District, Freq))) +
geom_col(fill = "darkseagreen") +
geom_col(fill = "darkgreen", data = district_freq[2,]) +
geom_label(data = district_freq[1:3,],
aes(label = Freq), nudge_x = 2) +
geom_vline(xintercept = mean(district_freq$Freq),
linetype = 2,
lwd = 1,
color = "tomato") + # memberikan garis mean
geom_label(aes(label = round(mean(district_freq$Freq),2),
x = 19,
y = "DUREN SAWIT"),
fill = "tomato") # memberikan label mean
Menambahkan ticks mark pada sumbu x
ggplot(data = district_freq[1:10, ],
aes(x = Freq, y = reorder(District, Freq))) +
geom_col(fill = "darkseagreen") +
geom_col(fill = "darkgreen", data = district_freq[2,]) +
geom_label(data = district_freq[1:3,],
aes(label = Freq), nudge_x = 2) +
geom_vline(xintercept = mean(district_freq$Freq),
linetype = 2,
lwd = 1,
color = "tomato") + # memberikan garis mean
geom_label(aes(label = round(mean(district_freq$Freq),2),
x = 19,
y = "DUREN SAWIT"),
fill = "tomato") + # memberikan label mean
scale_x_continuous(breaks = seq(0, 140, 20))
Percantik plot dengan labs()
ggplot(data = district_freq[1:10, ],
aes(x = Freq, y = reorder(District, Freq))) +
geom_col(fill = "darkseagreen") +
geom_col(fill = "darkgreen", data = district_freq[2,]) +
geom_label(data = district_freq[1:3,],
aes(label = Freq), nudge_x = 2) +
geom_vline(xintercept = mean(district_freq$Freq),
linetype = 2,
lwd = 1,
color = "tomato") + # memberikan garis mean
geom_label(aes(label = round(mean(district_freq$Freq),2),
x = 19,
y = "DUREN SAWIT"),
fill = "tomato")+ # memberikan label mean
scale_x_continuous(breaks = seq(0, 140, 20)) +
labs(title = "Top 10 District",
subtitle = "Based on Customer Frequency",
x = "Frequency",
y = NULL)
Percantik plot dengan theme()
ggplot(data = district_freq[1:10, ],
aes(x = Freq, y = reorder(District, Freq))) +
geom_col(fill = "darkseagreen") +
geom_col(fill = "darkgreen", data = district_freq[2,]) +
geom_label(data = district_freq[1:3,],
aes(label = Freq), nudge_x = 2) +
geom_vline(aes(xintercept = mean(district_freq$Freq)),
linetype = 2,
lwd = 1,
color = "tomato") + # memberikan garis mean
geom_label(aes(label = round(mean(district_freq$Freq),2),
x = 19,
y = "DUREN SAWIT"),
fill = "tomato")+ # memberikan label mean
scale_x_continuous(breaks = seq(0, 140, 20)) +
labs(title = "Top 10 District",
subtitle = "Based on Customer Frequency",
x = "Frequency",
y = NULL) +
theme_minimal() +
theme(axis.text.y = element_text(face = "bold", color = "dimgrey"),
plot.title = element_text(face = "bold", size = 15))
Province
province_freq <- as.data.frame(table(sales_clean$Province))
# Diurutkan
province_freq <- province_freq[order(province_freq$Freq, decreasing = T), ]
# Melakukan pengubahan nama kolom
names(province_freq) <- c("Province", "Freq")
province_freq
ggplot(province_freq, aes(x = reorder(Province, -Freq), y = Freq)) +
geom_bar(stat = "identity", aes(fill = Province), show.legend = F, width = 0.6) +
scale_fill_brewer(palette = 5) +
geom_text(aes(label = Freq), nudge_y = 50) +
labs(title = "Province Customer Distribution",
subtitle = "from e-commerce channel",
y = NULL,
x = "Province") +
theme_minimal() +
theme(plot.title = element_text(face = "bold")
)
Business Question: Ingin diketahui distribusi nilai
AmountwithPPn
< 2000000 dan > 1000000 untuk
masing-masing Province
menggunakan boxplot.
province_agg <- sales_clean[sales_clean$AmountwithPPn <= 2000000 & sales_clean$AmountwithPPn >= 1000000,]
head(province_agg)
Visualisasi boxplot dan jitter yang berisikan nilai
Qty
ggplot(data = province_agg, aes(x = Province, y = AmountwithPPn)) +
geom_boxplot(outlier.color = "red") +
geom_jitter(aes(size = Qty))
Memberikan warna
ggplot(data = province_agg, aes(x = Province, y = AmountwithPPn)) +
geom_boxplot(outlier.shape = NA, fill = "khaki", color = "darkgrey") +
geom_jitter(aes(size = Qty, color = Province)) +
scale_color_brewer(palette = 8) +
theme_minimal()
Mempercantik Plot
ggplot(data = province_agg, aes(x = Province, y = AmountwithPPn)) +
geom_boxplot(outlier.shape = NA, fill = "khaki", color = "darkgrey") +
geom_jitter(aes(size = Qty, color = Province)) +
scale_color_brewer(palette = 8) +
labs(title = "Boxplot Distribution of Province",
subtitle = "Based on Amount With PPN & Quantity",
x = NULL,
y = "Amount with PPN (in Rupiah)",
size = "Quantity") +
theme_minimal()
Menambahkan koma pada axis y untuk satuan Rupiah
library(scales)
ggplot(data = province_agg, aes(x = Province, y = AmountwithPPn)) +
geom_boxplot(outlier.shape = NA, fill = "khaki", color = "darkgrey") +
geom_jitter(aes(size = Qty, color = Province)) +
scale_color_brewer(palette = 8) +
scale_y_continuous(labels = comma) +
labs(title = "Boxplot Distribution of Province",
subtitle = "Based on Amount With PPN & Quantity",
x = NULL,
y = "Amount with PPN (in Rupiah)",
size = "Quantity") +
theme_minimal()
Insight:
- Jawa Barat merupakan provinsi dengan jumlah Amount with PPN yang paling bervariatif di antara lainnya.
- Berdasarkan nilai median, Banten merupakan Provinsi dengan Amount with PPN tertinggi
Occupation
Top 10 Occupation
- Persiapkan datanya
occ_freq <- as.data.frame(table(sales_clean$Occupation))
occ_freq <- occ_freq[order(occ_freq$Freq, decreasing = T), ]
- Top 10 Occupation
- Visualisasi
ggplot(data = occ_freq_5, mapping = aes(x = Freq, y = reorder(Occupation, Freq))) +
geom_col(aes(fill = Freq), show.legend = F) +
scale_fill_gradient(low = "lightsalmon", high = "darkred") +
labs(title = "Customer Occupation",
subtitle = "from e-commerce channel",
x = "Frequency",
y = NULL) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", hjust = 0.4),
plot.subtitle = element_text(hjust = 0.4))
Apabila ingin ditampilkan nilai proporsi pada visualisasi
occ_prop <- as.data.frame(prop.table(table(sales_clean$Occupation)))
occ_prop <- occ_prop[order(occ_prop$Freq, decreasing = T), ]
occ_prop_10 <- head(occ_prop, 10)
names(occ_prop_10) <- c("Occupation", "Prop")
occ_prop_10
ggplot(data = occ_prop_10, mapping = aes(x = Prop, y = reorder(Occupation, Prop))) +
geom_col(aes(fill = Prop), show.legend = F) +
scale_fill_gradient(low = "lightsalmon", high = "darkred") +
geom_label(aes(label = scales::percent(round(Prop, 4))), size = 3, nudge_x = 0.02) +
scale_x_continuous(labels = scales::percent) + # menambahkan label pada axis
labs(title = "Customer Occupation Percentage",
subtitle = "from e-commerce channel",
x = "Percentage",
y = NULL) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", hjust = 0.4),
plot.subtitle = element_text(hjust = 0.4))
Source
Top 5 Source Detail
source_freq <- as.data.frame(table(sales_clean$Source))
source_freq <- source_freq[order(source_freq$Freq, decreasing = T), ]
source_freq <- head(source_freq, 5)
names(source_freq) <- c("Source", "Freq")
source_freq
Visualization
ggplot(data = source_freq, mapping = aes(x = Freq, y = reorder(Source, Freq))) +
geom_col(aes(fill = Freq), show.legend = F) +
scale_fill_gradient(low = "plum", high = "plum4") +
geom_text(aes(label = Freq), nudge_x = 80) +
labs(title = "Customer by Source",
subtitle = "from e-commerce channel",
x = "Frequency",
y = NULL) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", hjust = 0.4),
plot.subtitle = element_text(hjust = 0.4))
Customer Revenue Analysis
By analyzing customer revenue patterns, businesses can make informed decisions, enhance customer satisfaction, and drive long-term growth. This report aims to provide a comprehensive overview of customer revenue analysis, its significance, methodologies, and the actionable insights it can offer to organizations
Monthly Trend Sales
Akan dilihat bagaimana median trend Sales setiap bulan dan tahun
hanya untuk tahun 2020 dan 2021 berdasarkan
AmountwithPPn
agg_sales <- aggregate(AmountwithPPn ~ year_purchased + month_purchased,
data = sales_clean,
FUN = median)
agg_sales <- agg_sales[agg_sales$year_purchased %in% c(2020, 2021), ]
agg_sales$year_purchased <- as.factor(agg_sales$year_purchased)
Visualisasi dengan geom_line
ggplot(data = agg_sales, aes(x = month_purchased, y = AmountwithPPn)) +
geom_line(aes(group = year_purchased, color = year_purchased), size = 1) +
geom_point(aes(color = year_purchased), size = 2) +
scale_x_discrete(guide = guide_axis(angle = 45)) +
scale_color_brewer(palette = "Set2")
Hanya ingin diberikan label untuk bulan july
ggplot(data = agg_sales, aes(x = month_purchased, y = AmountwithPPn)) +
geom_line(aes(group = year_purchased, color = year_purchased), size = 1) +
geom_point(aes(color = year_purchased), size = 2) +
geom_label(data = agg_sales[agg_sales$month_purchased == 'July', ],
aes(label = scales::comma(round(AmountwithPPn, 1))),
size = 2,
nudge_y = 10000) +
scale_x_discrete(guide = guide_axis(angle = 45)) +
scale_y_continuous(labels = scales::comma) +
scale_color_brewer(palette = "Set2")
Percantik plot
ggplot(data = agg_sales, aes(x = month_purchased, y = AmountwithPPn)) +
geom_line(aes(group = year_purchased, color = year_purchased), size = 1) +
geom_point(aes(color = year_purchased), size = 2) +
geom_label(data = agg_sales[agg_sales$month_purchased == 'July', ],
aes(label = scales::comma(round(AmountwithPPn, 1))),
size = 2,
nudge_y = 10000) +
scale_x_discrete(guide = guide_axis(angle = 45)) +
scale_y_continuous(labels = scales::comma) +
scale_color_brewer(palette = "Set2") +
labs(title = "Monthly Trend Sales Revenue",
subtitle = "in 2020 and 2021",
y = "Month",
x = "Amount of Sales",
color = "Year") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5))
Contribution Sales by Source (in Percentage)
Step by Step:
- Persiapkan Data
source_agg <- aggregate(AmountwithPPn ~ Source,
data = sales_clean,
FUN = sum)
source_agg <- source_agg[order(source_agg$AmountwithPPn, decreasing = T), ]
source_agg$percentage <- source_agg$AmountwithPPn/sum(source_agg$AmountwithPPn)
source_agg
ggplot(data = source_agg, aes(x = percentage, y = reorder(Source, percentage))) +
geom_col(aes(fill = percentage), show.legend = F) +
geom_label(aes(label = scales::percent(round(percentage,4))), size = 3, nudge_x = 0.05) +
scale_fill_gradient(low = "peru", high = "moccasin") +
scale_x_continuous(label = scales::percent) +
labs(title = "Contribution Sales by Source",
subtitle = "in percentage (%)",
y = "Source") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.35, face = "bold"),
plot.subtitle = element_text(hjust = 0.35))
Contribution Sales by Product
product_agg <- aggregate(AmountwithPPn ~ Product + Brand,
data = sales_clean,
FUN = sum)
product_agg <- product_agg[order(product_agg$AmountwithPPn, decreasing = T), ]
product_agg
Membuat multivariate plot
library(scales)
ggplot(data = product_agg, mapping = aes(y = reorder(Product,AmountwithPPn), x = AmountwithPPn)) +
geom_col(aes(fill = Brand), show.legend = F) +
geom_text(aes(label = comma(AmountwithPPn)), size = 2) +
facet_wrap(~Brand, ncol = 2, scales = "free") +
# untuk mengatur wraping nilai agar tidak terlalu panjang
scale_y_discrete(label = label_wrap(20)) +
# untuk mengatur agar skala x ada koma
scale_x_continuous(label = comma) +
# untuk mengatur ukuran axis x
theme(axis.text.y = element_text(size = 7),
axis.text.x = element_text(size = 5)) +
labs(title = "Contribution Sales by Product",
subtitle = "in Total",
y = "Product Name",
x = NULL)
Contribution Sales by City
Kita akan coba melakukan agregasi data dengan menampilkan nilai rata-rata dari sales/revenue untuk province jakarta saja.
# melakukan filtering hanya untuk Province DKI Jakarta
city_jakarta <- sales_clean[sales_clean$Province == "DKI JAKARTA", ]
# Melakukan Aggregate dan pembentukan percent
city_agg1 <- aggregate(AmountwithPPn ~ City + Brand,
data = city_jakarta,
FUN = mean)
city_agg1 <- transform(city_agg1,
perc = ave(AmountwithPPn,
City,
FUN = prop.table))
city_agg1
In Percentage Multivariate Plot
ggplot(data = city_agg1, mapping = aes(x = AmountwithPPn, y = City))+
geom_col(aes(group = Brand, fill = Brand), position = position_fill(reverse = T)) +
scale_x_continuous(label = percent) +
geom_text(aes(label = percent(perc)), position = position_fill(vjust = 0.5), size = 3.1) +
scale_fill_brewer(palette = "Set3") +
theme_minimal()
Contribution Sales by Province
# Melakukan Aggregate dan pembentukan percent
prov_agg1 <- aggregate(AmountwithPPn ~ Province + Brand,
data = sales_clean,
FUN = mean)
prov_agg1
ggplot(data = prov_agg1, aes(x = Province, y = AmountwithPPn, fill = Brand)) +
geom_col(aes(group = Brand), position = "dodge") +
geom_text(aes(label = comma(x = AmountwithPPn, prefix = "Rp.", suffix = ",-")),
position = position_dodge(0.9), vjust = -0.5, size = 2.9) +
scale_y_continuous(labels = comma, breaks = seq(0,600000,100000), limits = c(0, 500000)) +
theme_minimal()
Spatial analysis using leaflet
city_all <- aggregate(AmountwithPPn ~ City,
data = sales_clean,
FUN = mean)
city_all$percent <- city_all$AmountwithPPn/sum(city_all$AmountwithPPn)
city_all
# menambahkan data latitude_longitude
latlong <- data.frame("lat" = c(-6.9147444, -6.8937121, -6.233333, -6.6, -7.3333333, -6.8172531, -6.0169825, -6.880239, -6.715534, -6.39, -7.227906, -6.336315, -6.211544, -6.211544, -6.332973, -6.211544, -6.211544, -6.3227303, -6.9833333, -6.8531026, -6.314835, -6.5386806, -6.12009, -6.569361, -6.92405, -6.8329, -6.1783056, -6.2888889, -7.327954),
"long" = c(107.6098111, 107.4321959, 107, 106.8, 108.35, 107.1307289, 106.040506, 107.5355, 108.564003, 106.83, 107.908699, 108.325104, 106.845172, 106.845172, 106.807915, 106.845172, 106.845172, 107.3375791, 108.4833333, 108.2258897, 106.103897, 107.4499404, 106.150299, 107.752403, 106.922203, 107.9532, 106.6318889, 106.7180556, 108.214104))
library(maps)
global <- map_data("world")
world <- ggplot() +
geom_polygon(data = global, aes(x=long, y = lat, group = group),
fill = "gray85", color = "gray80") +
coord_fixed(1.3)
Indo <- world + xlim(94,142) + ylim(-11,7.5)
library(ggrepel)
world + xlim(105,115) + ylim(-9,-5.8)+
geom_point(data = city_all_clean, aes(x = long, y = lat, size = AmountwithPPn, color = AmountwithPPn),
alpha = 0.5,
show.legend = F) +
scale_color_gradient(low = "yellow", high = "red") +
geom_text_repel(data = city_all_clean, aes(x = long, y = lat, label= City), size = 2)
library(htmltools)
library(dplyr)
library(leaflet)
m <- leaflet(city_all_clean)
pop <- paste("Region: ", city_all_clean$City, "<br/>",
"Revenue: ", round(city_all_clean$AmountwithPPn)) %>%
lapply(htmltools::HTML)
# mendefinisikan variabel yang menjadi dasar pewarnaan
col <- city_all_clean$AmountwithPPn
mybins <- seq(50000, 700000, by=150000)
pal <- colorBin( palette="YlOrRd",
domain=col,
bins=mybins)
m %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>%
addCircleMarkers(~long, ~lat,
fillColor = ~pal(col),
weight = 1,
color = "white",
fillOpacity = 0.8,
radius=8,
stroke=FALSE,
label = pop,) %>%
addLegend("bottomright",
pal = pal,
values = ~col,
title = "Revenue",
labFormat = labelFormat(digits = 2),
opacity = 1)