Quantium Virtual Internship - Retail Strategy and Analytics Project

Yong Jiang 03/29/2022

Data Preparation

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(readr)
library(readxl)
library(data.table)
## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
purchase_behavior <- read_csv("QVI_purchase_behaviour.csv")
## Rows: 72637 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): LIFESTAGE, PREMIUM_CUSTOMER
## dbl (1): LYLTY_CARD_NBR
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
transaction_data <- read_excel("QVI_transaction_data.xlsx")

Exam, clean and summary the data

colnames(purchase_behavior)
## [1] "LYLTY_CARD_NBR"   "LIFESTAGE"        "PREMIUM_CUSTOMER"
colnames(transaction_data)
## [1] "DATE"           "STORE_NBR"      "LYLTY_CARD_NBR" "TXN_ID"        
## [5] "PROD_NBR"       "PROD_NAME"      "PROD_QTY"       "TOT_SALES"
str(purchase_behavior)
## spec_tbl_df [72,637 x 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ LYLTY_CARD_NBR  : num [1:72637] 1000 1002 1003 1004 1005 ...
##  $ LIFESTAGE       : chr [1:72637] "YOUNG SINGLES/COUPLES" "YOUNG SINGLES/COUPLES" "YOUNG FAMILIES" "OLDER SINGLES/COUPLES" ...
##  $ PREMIUM_CUSTOMER: chr [1:72637] "Premium" "Mainstream" "Budget" "Mainstream" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   LYLTY_CARD_NBR = col_double(),
##   ..   LIFESTAGE = col_character(),
##   ..   PREMIUM_CUSTOMER = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(transaction_data)
## tibble [264,836 x 8] (S3: tbl_df/tbl/data.frame)
##  $ DATE          : num [1:264836] 43390 43599 43605 43329 43330 ...
##  $ STORE_NBR     : num [1:264836] 1 1 1 2 2 4 4 4 5 7 ...
##  $ LYLTY_CARD_NBR: num [1:264836] 1000 1307 1343 2373 2426 ...
##  $ TXN_ID        : num [1:264836] 1 348 383 974 1038 ...
##  $ PROD_NBR      : num [1:264836] 5 66 61 69 108 57 16 24 42 52 ...
##  $ PROD_NAME     : chr [1:264836] "Natural Chip        Compny SeaSalt175g" "CCs Nacho Cheese    175g" "Smiths Crinkle Cut  Chips Chicken 170g" "Smiths Chip Thinly  S/Cream&Onion 175g" ...
##  $ PROD_QTY      : num [1:264836] 2 3 2 5 3 1 1 1 1 2 ...
##  $ TOT_SALES     : num [1:264836] 6 6.3 2.9 15 13.8 5.1 5.7 3.6 3.9 7.2 ...
head(purchase_behavior)
## # A tibble: 6 x 3
##   LYLTY_CARD_NBR LIFESTAGE              PREMIUM_CUSTOMER
##            <dbl> <chr>                  <chr>           
## 1           1000 YOUNG SINGLES/COUPLES  Premium         
## 2           1002 YOUNG SINGLES/COUPLES  Mainstream      
## 3           1003 YOUNG FAMILIES         Budget          
## 4           1004 OLDER SINGLES/COUPLES  Mainstream      
## 5           1005 MIDAGE SINGLES/COUPLES Mainstream      
## 6           1007 YOUNG SINGLES/COUPLES  Budget
head(transaction_data)
## # A tibble: 6 x 8
##    DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME    PROD_QTY TOT_SALES
##   <dbl>     <dbl>          <dbl>  <dbl>    <dbl> <chr>           <dbl>     <dbl>
## 1 43390         1           1000      1        5 Natural Chi~        2       6  
## 2 43599         1           1307    348       66 CCs Nacho C~        3       6.3
## 3 43605         1           1343    383       61 Smiths Crin~        2       2.9
## 4 43329         2           2373    974       69 Smiths Chip~        5      15  
## 5 43330         2           2426   1038      108 Kettle Tort~        3      13.8
## 6 43604         4           4074   2982       57 Old El Paso~        1       5.1
transaction_data$DATE <- as.Date(transaction_data$DATE, origin = "1899-12-30")
head(transaction_data)
## # A tibble: 6 x 8
##   DATE       STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME         PROD_QTY
##   <date>         <dbl>          <dbl>  <dbl>    <dbl> <chr>                <dbl>
## 1 2018-10-17         1           1000      1        5 Natural Chip    ~        2
## 2 2019-05-14         1           1307    348       66 CCs Nacho Cheese~        3
## 3 2019-05-20         1           1343    383       61 Smiths Crinkle C~        2
## 4 2018-08-17         2           2373    974       69 Smiths Chip Thin~        5
## 5 2018-08-18         2           2426   1038      108 Kettle Tortilla ~        3
## 6 2019-05-19         4           4074   2982       57 Old El Paso Sals~        1
## # ... with 1 more variable: TOT_SALES <dbl>
sum(purchase_behavior$LYLTY_CARD_NBR == '')
## [1] 0
sum(purchase_behavior$LIFESTAGE == '')
## [1] 0
sum(purchase_behavior$PREMIUM_CUSTOMER == '')
## [1] 0
sum(transaction_data$DATE == '')
## [1] NA
sum(transaction_data$STORE_NBR == '')
## [1] 0
sum(transaction_data$LYLTY_CARD_NBR == '')
## [1] 0
sum(transaction_data$TXN_ID == '')
## [1] 0
sum(transaction_data$PROD_NBR == '')
## [1] 0
sum(transaction_data$PROD_NAME == '')
## [1] 0
sum(transaction_data$PROD_QTY == '')
## [1] 0
sum(transaction_data$TOT_SALES == '')
## [1] 0

There is no empty cell from both data sets, but the date column has N/A.

Exam Transaction Data

#### product name percentage
PN <- transaction_data %>% 
  group_by(PROD_NAME) %>% 
  summarise(cnt = n()) %>% 
  mutate(freq = round(cnt/sum(cnt), 3)) %>% 
  arrange(desc(freq))
View(PN)
#### product quantity percentage
PQ <- transaction_data %>% 
  group_by(PROD_QTY) %>% 
  summarise(cnt = n()) %>% 
  mutate(freq = round(cnt/sum(cnt), 3)) %>% 
  arrange(desc(freq))
View(PQ)

There are two transactions that customer bought 200, they are outliers that need to be cleaned.

transaction_data_cleaned <- transaction_data %>% 
  filter(LYLTY_CARD_NBR != 226000)
View(transaction_data_cleaned)
transaction_data_cleaned %>% summarise_if(is.numeric, max)
## # A tibble: 1 x 6
##   STORE_NBR LYLTY_CARD_NBR  TXN_ID PROD_NBR PROD_QTY TOT_SALES
##       <dbl>          <dbl>   <dbl>    <dbl>    <dbl>     <dbl>
## 1       272        2373711 2415841      114        5      29.5
transaction_data_cleaned %>% summarise_if(is.numeric, min)
## # A tibble: 1 x 6
##   STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_QTY TOT_SALES
##       <dbl>          <dbl>  <dbl>    <dbl>    <dbl>     <dbl>
## 1         1           1000      1        1        1       1.5
transaction_data_cleaned %>% summarise_if(is.numeric, mean)
## # A tibble: 1 x 6
##   STORE_NBR LYLTY_CARD_NBR  TXN_ID PROD_NBR PROD_QTY TOT_SALES
##       <dbl>          <dbl>   <dbl>    <dbl>    <dbl>     <dbl>
## 1      135.        135549. 135158.     56.6     1.91      7.30
transaction_data_cleaned %>% summarise_if(is.numeric, median)
## # A tibble: 1 x 6
##   STORE_NBR LYLTY_CARD_NBR  TXN_ID PROD_NBR PROD_QTY TOT_SALES
##       <dbl>          <dbl>   <dbl>    <dbl>    <dbl>     <dbl>
## 1       130         130357 135136.       56        2       7.4
product_name <- transaction_data_cleaned %>% 
  group_by(PROD_NAME) %>% 
  summarise(count = n())
View(product_name)

Need to double confirm the definition of chip with Category Manager for Chips, for example, is the puff product in it? For now, I will exclude salsa only.

transaction_data_cleaned %>% filter(!grepl("Salsa", PROD_NAME))
## # A tibble: 246,740 x 8
##    DATE       STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME        PROD_QTY
##    <date>         <dbl>          <dbl>  <dbl>    <dbl> <chr>               <dbl>
##  1 2018-10-17         1           1000      1        5 Natural Chip   ~        2
##  2 2019-05-14         1           1307    348       66 CCs Nacho Chees~        3
##  3 2019-05-20         1           1343    383       61 Smiths Crinkle ~        2
##  4 2018-08-17         2           2373    974       69 Smiths Chip Thi~        5
##  5 2018-08-18         2           2426   1038      108 Kettle Tortilla~        3
##  6 2019-05-16         4           4149   3333       16 Smiths Crinkle ~        1
##  7 2019-05-16         4           4196   3539       24 Grain Waves    ~        1
##  8 2018-08-20         5           5026   4525       42 Doritos Corn Ch~        1
##  9 2018-08-18         7           7150   6900       52 Grain Waves Sou~        2
## 10 2019-05-17         7           7215   7176       16 Smiths Crinkle ~        1
## # ... with 246,730 more rows, and 1 more variable: TOT_SALES <dbl>
product_name <- transaction_data_cleaned %>% 
  group_by(PROD_NAME) %>% 
  summarise(count = n())
View(product_name)
transaction_data_cleaned$PACK_SIZE <- parse_number(transaction_data_cleaned$PROD_NAME)
max(transaction_data_cleaned$PACK_SIZE)
## [1] 380
min(transaction_data_cleaned$PACK_SIZE)
## [1] 70
pack_size_transaction <- transaction_data_cleaned %>% 
  group_by(PACK_SIZE) %>% 
  summarise(count = n())
ggplot(transaction_data_cleaned, aes(factor(PACK_SIZE))) + 
  geom_bar(color="black", fill="yellow") +
  labs(x = "Pack Size", y = "Number of transactions", title = "Transactions by Pack Size")

The top 5 best seller sizes are 175g, 150g, 134g 110g, and 170g.

transaction_data_cleaned$BrandName <- word(transaction_data_cleaned$PROD_NAME, 1)  
BrandName_count <- transaction_data_cleaned %>% 
  group_by(BrandName) %>% 
  summarise(count = n())

Some brand names are the same brands but the spells are different. We can convert same brands into the same spells.

transaction_data_cleaned$BrandName[transaction_data_cleaned$BrandName == "Dorito"] <- "Doritos"
transaction_data_cleaned$BrandName[transaction_data_cleaned$BrandName == "Red"] <- "RRD"
transaction_data_cleaned$BrandName[transaction_data_cleaned$BrandName == "Smith"] <- "Smiths"

BrandName_count <- transaction_data_cleaned %>% 
  group_by(BrandName) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count))
BrandName_count <- transaction_data_cleaned %>% 
         group_by(BrandName) %>% 
         summarise(cnt = n()) %>% 
         arrange(desc(cnt)) %>% 
         ggplot(aes(x= reorder(BrandName, cnt), y = cnt)) +
         geom_col(color = "black", fill = "yellow")+
         labs(title = "Transaction by Brand Name", y = "Count", x = "Brand Name") + 
         scale_y_continuous(labels = scales::comma) +
         coord_flip() +
         theme_minimal()
print(BrandName_count)

5 top popular brands are Kettle, Smiths, Doritos, Pringles and RRD.

date_number <- transaction_data_cleaned %>% 
  group_by(DATE) %>% 
  summarise(count = n())
print(date_number)
## # A tibble: 364 x 2
##    DATE       count
##    <date>     <int>
##  1 2018-07-01   724
##  2 2018-07-02   711
##  3 2018-07-03   722
##  4 2018-07-04   714
##  5 2018-07-05   712
##  6 2018-07-06   762
##  7 2018-07-07   750
##  8 2018-07-08   696
##  9 2018-07-09   749
## 10 2018-07-10   705
## # ... with 354 more rows

There are 364 rows in total while there should be 365 days from 2018/07/01 to 2019/06/30.

### Fulfill the date
start_date <- ymd("2018/07/01")
end_date <- ymd("2019/06/30")
range_date <- seq(start_date, by = "day", length.out = 365)
date_2018_2019 <- data.frame(DATE = range_date)
View(date_2018_2019)
transaction_data_2018_2019 <- full_join(date_2018_2019, transaction_data_cleaned)
## Joining, by = "DATE"
### check 
date_number <- transaction_data_2018_2019 %>% 
  group_by(DATE) %>% 
  summarise(count = n())
print(date_number)
## # A tibble: 365 x 2
##    DATE       count
##    <date>     <int>
##  1 2018-07-01   724
##  2 2018-07-02   711
##  3 2018-07-03   722
##  4 2018-07-04   714
##  5 2018-07-05   712
##  6 2018-07-06   762
##  7 2018-07-07   750
##  8 2018-07-08   696
##  9 2018-07-09   749
## 10 2018-07-10   705
## # ... with 355 more rows
ggplot(date_number, aes(x = DATE, y = count)) + 
    geom_line() + 
    labs(x = "Day", y = "Number of transactions", title = "Transactions over time") + 
  scale_x_date(breaks = "1 month") + theme(axis.text.x = element_text(angle = 65, vjust = 0.5))

There are big changes in Dec 2018, let’s take a close look.

Dec_2018 <- transaction_data_2018_2019 %>% 
  filter(between(DATE, as.Date("2018-12-01"), as.Date("2018-12-31")))
View(Dec_2018)
transaction_data_Dec_2018 <- Dec_2018 %>% 
  group_by(DATE) %>% 
  summarise(count = n())

ggplot(transaction_data_Dec_2018, aes(x = DATE, y = count)) + 
    geom_line() + 
    labs(x = "Day", y = "Number of transactions", title = "Transactions over time") + 
  scale_x_date(breaks = "1 day") + theme(axis.text.x = element_text(angle = 65, vjust = 0.5))

The missing date with no transaction is Dec-24, the Christmas eve that no shops are open. The increased sales on Dec may because of the Christmas holiday sales.

Exam Purchase Behavior Data

LP <- purchase_behavior %>%  
  group_by(LIFESTAGE) %>%  
  summarise(cnt = n()) %>% 
  mutate(freq = round(cnt/sum(cnt), 3)) %>%  
  arrange(desc(freq))
View(LP)
blank_theme <- theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank(),
    panel.grid=element_blank(),
    axis.ticks = element_blank(),
    plot.title=element_text(size=14, face="bold")
  )

ggplot(LP, aes(x = "", y = cnt, fill = LIFESTAGE)) + 
  geom_bar(width = 1, stat = "identity") + 
  coord_polar("y", start = 0) + scale_fill_brewer("") +
  blank_theme +
  theme(axis.text.x = element_blank())

PC <- purchase_behavior %>% 
  group_by(PREMIUM_CUSTOMER) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count)) 
  
ggplot(PC, aes(x= PREMIUM_CUSTOMER, y = count)) +
  geom_bar(stat = "identity", color = "Black", fill = "Yellow")+
  labs(title = "Transaction by Premium Customer", x = "Premium Customer") + 
  geom_text(aes(label = count), size = 3, position = position_stack(vjust = 1.05))

Merge Two Data Sets

QVI_data <- merge(transaction_data_2018_2019, purchase_behavior, all.x = TRUE)
QVI_data$CUSTOMER_SEGMENT <- paste(QVI_data$PREMIUM_CUSTOMER, QVI_data$LIFESTAGE)
ggplot(data = QVI_data) +
  geom_col(mapping = aes(x = BrandName, y = PROD_QTY, fill = BrandName)) + 
  labs(x = "Brand Name", y = "Sale Count", title = "Brand Preference by Customer Segment") +
  theme(axis.text.x=element_blank()) +
  facet_wrap(~CUSTOMER_SEGMENT)
## Warning: Removed 1 rows containing missing values (position_stack).

Too many different customer segments.

X <- QVI_data %>% 
  group_by(PREMIUM_CUSTOMER, LIFESTAGE) %>% 
  summarise(
    cnt = n(),
    total_sale = sum(TOT_SALES),
    total_chip_amount = sum(PROD_QTY),
    sales_per_purchase = round(sum(TOT_SALES)/cnt, 3),
    average_chip_amt = round(sum(PROD_QTY)/cnt, 3),
    chip_unit_price = round(sum(TOT_SALES)/sum(PROD_QTY), 3)
     ) %>% 
   arrange(desc(total_sale))
## `summarise()` has grouped output by 'PREMIUM_CUSTOMER'. You can override using
## the `.groups` argument.

Top 3 sales come from Budget - Older families, Mainstream - Young singles/couples, and Mainstream - Retirees. Top 3 sales every purchase come from Mainstream - Midages singles/couples and Young singles/couples, and Premium - Retirees.
Top 3 chip sales quantity come from Older families. Top 3 chip unit price come from Mainstream - Young singles/couples, Midages singles/couples and New families

ggplot(data = X) +
  geom_col(mapping = aes(x = LIFESTAGE, y = total_sale, fill = LIFESTAGE)) + 
  labs(x = "Life Stage", title = "Total Sales by Customer Segment 1")+
  theme(axis.text.x=element_blank())+
  facet_wrap(~PREMIUM_CUSTOMER)
## Warning: Removed 1 rows containing missing values (position_stack).

ggplot(data = X) +
  geom_col(mapping = aes(x = PREMIUM_CUSTOMER, y = total_sale, fill = PREMIUM_CUSTOMER)) + 
  labs(x = "Purchase Behavior", title = "Total Sales by Customer Segment 2")+
  theme(axis.text.x=element_blank())+
  facet_wrap(~LIFESTAGE)
## Warning: Removed 1 rows containing missing values (position_stack).

Visualizations

X_1 <- QVI_data %>% 
  group_by(PREMIUM_CUSTOMER, LIFESTAGE) %>% 
  summarise(
    cnt = n(),
    total_sale = sum(TOT_SALES)) 
## `summarise()` has grouped output by 'PREMIUM_CUSTOMER'. You can override using
## the `.groups` argument.
X_1$CUSTOMER_SEGMENT <- paste(X_1$PREMIUM_CUSTOMER, X_1$LIFESTAGE) 
x_1 <- X_1[, c("CUSTOMER_SEGMENT", "total_sale")] %>% 
  arrange(desc(total_sale)) %>% 
  top_n(3)
## Selecting by total_sale
ggplot(x_1, aes(x = reorder(CUSTOMER_SEGMENT, desc(total_sale)), y = total_sale)) +
  geom_col(color = "Black", fill = "Yellow", width = 0.75) +
  labs(x = "Customer Segment", y = "Total Sale", title = "Top 3 Total Sale by Customer Segment") + 
  theme(axis.text.x = element_text(angle = 15, vjust = 0.5))

X_2 <- QVI_data %>% 
  group_by(PREMIUM_CUSTOMER, LIFESTAGE) %>% 
  summarise(
    cnt = n(),
    sales_per_purchase = round(sum(TOT_SALES)/cnt, 3))
## `summarise()` has grouped output by 'PREMIUM_CUSTOMER'. You can override using
## the `.groups` argument.
X_2$CUSTOMER_SEGMENT <- paste(X_2$PREMIUM_CUSTOMER, X_2$LIFESTAGE) 
x_2 <- X_2[, c("CUSTOMER_SEGMENT", "sales_per_purchase")] %>% 
  arrange(desc(sales_per_purchase)) %>% 
  top_n(3)
## Selecting by sales_per_purchase
ggplot(x_2, aes(x = reorder(CUSTOMER_SEGMENT, desc(sales_per_purchase)), y = sales_per_purchase)) +
  geom_col(color = "Black", fill = "Yellow", width = 0.75) +
  labs(x = "Customer Segment", y = "Sales per Purchase", title = "Top 3 Sale per Purchase by Customer Segment") + 
  theme(axis.text.x = element_text(angle = 15, vjust = 0.5))

X_3 <- QVI_data %>% 
  group_by(PREMIUM_CUSTOMER, LIFESTAGE) %>% 
  summarise(
    cnt = n(),
    average_chip_amt = round(sum(PROD_QTY)/cnt, 3))
## `summarise()` has grouped output by 'PREMIUM_CUSTOMER'. You can override using
## the `.groups` argument.
X_3$CUSTOMER_SEGMENT <- paste(X_3$PREMIUM_CUSTOMER, X_3$LIFESTAGE) 
x_3 <- X_3[, c("CUSTOMER_SEGMENT", "average_chip_amt")] %>% 
  arrange(desc(average_chip_amt)) %>% 
  top_n(3)
## Selecting by average_chip_amt
ggplot(x_3, aes(x = reorder(CUSTOMER_SEGMENT, desc(average_chip_amt)), y = average_chip_amt)) +
  geom_col(color = "Black", fill = "Yellow", width = 0.75) +
  labs(x = "Customer Segment", y = "Chip Quantity", title = "Top 3 Average Chip Purchase Quantity by Customer Segment") + 
  theme(axis.text.x = element_text(angle = 15, vjust = 0.5))

X_4 <- QVI_data %>% 
  group_by(PREMIUM_CUSTOMER, LIFESTAGE) %>% 
  summarise(
    cnt = n(),
    chip_unit_price = round(sum(TOT_SALES)/sum(PROD_QTY), 3))
## `summarise()` has grouped output by 'PREMIUM_CUSTOMER'. You can override using
## the `.groups` argument.
X_4$CUSTOMER_SEGMENT <- paste(X_4$PREMIUM_CUSTOMER, X_4$LIFESTAGE) 
x_4 <- X_4[, c("CUSTOMER_SEGMENT", "chip_unit_price")] %>% 
  arrange(desc(chip_unit_price)) %>% 
  top_n(3)
## Selecting by chip_unit_price
ggplot(x_4, aes(x = reorder(CUSTOMER_SEGMENT, desc(chip_unit_price)), y = chip_unit_price)) +
  geom_col(color = "Black", fill = "Yellow", width = 0.75) +
  labs(x = "Customer Segment", y = "Unit Price", title = "Top 3 Unit Price by Customer Segment") + 
  theme(axis.text.x = element_text(angle = 15, vjust = 0.5))

From conclusions above, these are seven customer segments that have more potential to purchase chips: Premium - Older families, Retirees Mainstream - Older families, Midages singles/couples, Young singles/couples, Retirees Budget - Older families

target_customer <- filter(QVI_data, CUSTOMER_SEGMENT %in% c("Premium OLDER FAMILIES", 
                                                            "Premium RETIREES", 
                                                            "Mainstream OLDER FAMILIES", 
                                                            "Mainstream MIDAGE SINGLES/COUPLES",
                                                            "Mainstream YOUNG SINGLES/COUPLES",
                                                            "Mainstream RETIREES",
                                                            "Budget OLDER FAMILIES"
                                                              ))

P <- target_customer %>% 
  group_by(CUSTOMER_SEGMENT, PACK_SIZE) %>% 
  summarise(cnt = n()) %>% 
  arrange(CUSTOMER_SEGMENT, desc(cnt)) 
## `summarise()` has grouped output by 'CUSTOMER_SEGMENT'. You can override using
## the `.groups` argument.
ggplot(P, aes(x= PACK_SIZE, y = cnt, fill = PACK_SIZE))+
  geom_col()+
  labs(x = "Pack Size", y = "Count", title = "Pack Size Preference by Target Customer Segment") + 
  facet_wrap(~CUSTOMER_SEGMENT)

  theme(axis.text.x = element_text(angle = 15, vjust = 0.5))
## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 15
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Top 3 pack size preference for targeted customer segments: 175g, 150g, 134g

B <- target_customer %>% 
  group_by(CUSTOMER_SEGMENT, BrandName) %>% 
  summarise(cnt = n())
## `summarise()` has grouped output by 'CUSTOMER_SEGMENT'. You can override using
## the `.groups` argument.
ggplot(B, aes(x= BrandName, y = cnt, fill = BrandName))+
  geom_col()+
  labs(x = "BrandName", y = "Count", title = "Brand Name Preference by Target Customer Segment") + 
  facet_wrap(~CUSTOMER_SEGMENT)

  theme(axis.text.x = element_text(angle = 15, vjust = 0.5))
## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 15
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE