1. Load Libraries and Data
#### Load libraries
library(ggplot2)
library(tidyverse)
library(tibble)
library(cowplot)
library(fitdistrplus)
library(data.table)
library(readr)
library(readxl)
library(stringr)
library(formatR)
library(ggmosaic)
library(lubridate)  # for month(), used when filtering December transactions

#### Load dataset
transactionData <- read_excel("QVI_transaction_data.xlsx")
customerData <- read.csv("QVI_purchase_behaviour.csv")
2. Transaction Analytics
#### 2. Examine transaction data
str(transactionData)
## tibble [264,836 × 8] (S3: tbl_df/tbl/data.frame)
##  $ DATE          : num [1:264836] 43390 43599 43605 43329 43330 ...
##  $ STORE_NBR     : num [1:264836] 1 1 1 2 2 4 4 4 5 7 ...
##  $ LYLTY_CARD_NBR: num [1:264836] 1000 1307 1343 2373 2426 ...
##  $ TXN_ID        : num [1:264836] 1 348 383 974 1038 ...
##  $ PROD_NBR      : num [1:264836] 5 66 61 69 108 57 16 24 42 52 ...
##  $ PROD_NAME     : chr [1:264836] "Natural Chip        Compny SeaSalt175g" "CCs Nacho Cheese    175g" "Smiths Crinkle Cut  Chips Chicken 170g" "Smiths Chip Thinly  S/Cream&Onion 175g" ...
##  $ PROD_QTY      : num [1:264836] 2 3 2 5 3 1 1 1 1 2 ...
##  $ TOT_SALES     : num [1:264836] 6 6.3 2.9 15 13.8 5.1 5.7 3.6 3.9 7.2 ...
# DATE is stored as an Excel serial number; convert it to Date (Excel's origin is 1899-12-30)
transactionData$DATE <- as.Date(transactionData$DATE, origin = "1899-12-30")
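# A quick optional check: after the conversion, the dates should span the
# 12-month window (1 Jul 2018 to 30 Jun 2019)
range(transactionData$DATE)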

# Check that we are looking at the right products by examining PROD_NAME
str(transactionData$PROD_NAME)
##  chr [1:264836] "Natural Chip        Compny SeaSalt175g" ...
transactionData %>% count(PROD_NAME, sort = TRUE)
# It looks like these are potato chips, but we want to be sure they are all 
# "chips". Let's perform some basic text analysis by summarizing the individual 
# words in the product names

#### Examine the words in PROD_NAME to see if there are any incorrect entries
#### such as products that are NOT chips
productWords <- strsplit(unique(as.character(transactionData$PROD_NAME)), " ") %>% 
  unlist() %>%
  as.data.frame()
setnames(productWords, "Words")

#### Remove digits by filtering out rows that contain digits, using str_detect()
productWords <- productWords %>% filter(!(str_detect(Words, "\\d")))

#### Remove special characters by keeping only words that contain letters
productWords <- productWords %>% filter(str_detect(Words, "[:alpha:]"))

# Let's look at the most common words by counting how many times each word 
# appears, sorted in descending order of frequency

#### Sort words from highest to lowest frequency
productWords %>% 
  group_by(Words) %>% summarize(Count = n()) %>% arrange(desc(Count))
# We should remove Salsa products because they're not chips

#### Remove Salsa products
transactionData <- transactionData %>% 
  dplyr::filter(!grepl('Salsa', PROD_NAME)) 

#### Check that no Salsa products remain
transactionData %>% 
  dplyr::filter(grepl('Salsa', PROD_NAME)) 
#### Let's check some transactions w.r.t. Loyalty Card data. 

# Unique Loyalty Cards
transactionData %>% dplyr::select(LYLTY_CARD_NBR) %>% distinct()
# Top 50 repeat customers
transactionData %>% group_by(LYLTY_CARD_NBR) %>% summarise(count = n()) %>% 
  filter(count > 1) %>% arrange(desc(count)) %>% print(n = 50)
## # A tibble: 51,757 × 2
##    LYLTY_CARD_NBR count
##             <dbl> <int>
##  1         162039    17
##  2         230078    17
##  3          23192    16
##  4         105026    16
##  5         113080    16
##  6         116181    16
##  7         128178    16
##  8         172032    16
##  9         179228    16
## 10          30162    15
## 11          94185    15
## 12         104117    15
## 13         107167    15
## 14         109036    15
## 15         112231    15
## 16         129050    15
## 17         138016    15
## 18         152054    15
## 19         156240    15
## 20         207003    15
## 21         213140    15
## 22         222242    15
## 23         259100    15
## 24           5168    14
## 25          13177    14
## 26          28020    14
## 27          32060    14
## 28          33073    14
## 29          43109    14
## 30          48155    14
## 31          57033    14
## 32          58361    14
## 33          62093    14
## 34          63197    14
## 35          71075    14
## 36         100130    14
## 37         104061    14
## 38         105016    14
## 39         109080    14
## 40         112168    14
## 41         112218    14
## 42         118124    14
## 43         129087    14
## 44         133138    14
## 45         138222    14
## 46         152094    14
## 47         156054    14
## 48         164034    14
## 49         179110    14
## 50         179213    14
## # … with 51,707 more rows
# Multiple transaction IDs
transactionData %>% group_by(TXN_ID) %>% summarise(count = n()) %>% 
  filter(count > 1) %>% arrange(desc(count)) %>% print(n = 10)
## # A tibble: 1,479 × 2
##    TXN_ID count
##     <dbl> <int>
##  1   1162     3
##  2 102237     3
##  3 108462     3
##  4 211119     3
##  5 228741     3
##  6 230356     3
##  7    517     2
##  8    628     2
##  9   1142     2
## 10   1498     2
## # … with 1,469 more rows
# Check for loyalty cards with multiple rows in the same transaction 
# (i.e., multiple products per TXN_ID)
transactionData %>% group_by(LYLTY_CARD_NBR, TXN_ID) %>% 
  summarize(count = n()) %>% 
  filter(count > 1) %>% arrange(desc(count))
#### Summarize the data to check for nulls and possible outliers
summary(transactionData)
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:2018-09-30   1st Qu.: 70.0   1st Qu.:  70015   1st Qu.:  67569  
##  Median :2018-12-30   Median :130.0   Median : 130367   Median : 135183  
##  Mean   :2018-12-30   Mean   :135.1   Mean   : 135531   Mean   : 135131  
##  3rd Qu.:2019-03-31   3rd Qu.:203.0   3rd Qu.: 203084   3rd Qu.: 202654  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##     PROD_NBR       PROD_NAME            PROD_QTY         TOT_SALES      
##  Min.   :  1.00   Length:246742      Min.   :  1.000   Min.   :  1.700  
##  1st Qu.: 26.00   Class :character   1st Qu.:  2.000   1st Qu.:  5.800  
##  Median : 53.00   Mode  :character   Median :  2.000   Median :  7.400  
##  Mean   : 56.35                      Mean   :  1.908   Mean   :  7.321  
##  3rd Qu.: 87.00                      3rd Qu.:  2.000   3rd Qu.:  8.800  
##  Max.   :114.00                      Max.   :200.000   Max.   :650.000
# There's an anomaly where the product quantity is 200. Let's investigate
# this further
transactionData %>% filter(PROD_QTY == 200) 
# Both rows belong to the same customer, LYLTY_CARD_NBR 226000. Let's see if this 
# customer had any other transactions
transactionData %>% filter(LYLTY_CARD_NBR == 226000)
# This customer made only two transactions over the 12-month period, both for 
# 200 packets of chips. This is an obvious anomaly, so we can remove this 
# customer from the data

#### Filter out the loyalty card number causing the anomaly
transactionData <- transactionData %>% filter(LYLTY_CARD_NBR != 226000)

#### Re-examine the data
summary(transactionData)
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:2018-09-30   1st Qu.: 70.0   1st Qu.:  70015   1st Qu.:  67569  
##  Median :2018-12-30   Median :130.0   Median : 130367   Median : 135182  
##  Mean   :2018-12-30   Mean   :135.1   Mean   : 135530   Mean   : 135130  
##  3rd Qu.:2019-03-31   3rd Qu.:203.0   3rd Qu.: 203083   3rd Qu.: 202652  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##     PROD_NBR       PROD_NAME            PROD_QTY       TOT_SALES     
##  Min.   :  1.00   Length:246740      Min.   :1.000   Min.   : 1.700  
##  1st Qu.: 26.00   Class :character   1st Qu.:2.000   1st Qu.: 5.800  
##  Median : 53.00   Mode  :character   Median :2.000   Median : 7.400  
##  Mean   : 56.35                      Mean   :1.906   Mean   : 7.316  
##  3rd Qu.: 87.00                      3rd Qu.:2.000   3rd Qu.: 8.800  
##  Max.   :114.00                      Max.   :5.000   Max.   :29.500
# The summary now looks much more coherent. Next, let's look at the number of 
# transactions over time to check for obvious data issues such as missing dates. 
# We can do this by counting the number of transactions per day and sorting by date

#### Count the number of transactions by date
transactionData %>% group_by(DATE) %>% summarize(count = n()) %>% arrange(DATE) 
# There are only 364 dates, which means one date is missing. Let's create a 
# sequence of dates from 1 Jul 2018 (2018-07-01) to 30 Jun 2019 (2019-06-30) and 
# chart the number of transactions over time to find the missing date.

#### Generate a sequence of dates and join this with the count of transactions by date
allDates <- data.frame(seq(as.Date('2018-07-01'), as.Date('2019-06-30'), by = "day"))
setnames(allDates, "DATE")

#### Join allDates to the daily transaction counts using a right join, so that
#### every calendar date appears (the missing day will have an NA count)
transactions_by_day <- 
  transactionData %>% 
  group_by(DATE) %>% summarize(count = n()) %>% arrange(DATE) 

transactions_by_day <- right_join(transactions_by_day, allDates, by = "DATE")
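# With the full calendar joined in, the missing date can also be read off
# directly as the row whose count is NA (an optional check)
transactions_by_day %>% filter(is.na(count))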

#### Let's plot the transactions over time; this will help us visualize the 
#### missing date
transactions_by_day %>% ggplot(aes(x = DATE, y = count)) + geom_line() +
   labs(x = "Date", y = "Number of transactions", 
        title = "Transactions per day (2018-2019)") +
   theme_bw() + scale_x_date(date_breaks = "1 month") +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# We can see a sharp increase in the number of transactions during December, 
# followed by a break in late December. Let's zoom in and check what's happening

#### Filter transactions_by_day to December 2018 only
transactions_by_day %>% filter(month(DATE) == 12) %>% 
          ggplot(aes(x = DATE, y = count)) + geom_line() +
          labs(x = "Date", y = "Number of transactions", 
               title = "Transactions per day in December 2018") +
          theme_bw() + scale_x_date(date_breaks = "1 day") +
          theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# We can see that the increase in sales occurs in the lead-up to Christmas, and 
# that there were 0 transactions on Christmas day itself. 
# This is because shops are closed on Christmas day.

# Now that we are satisfied that the data no longer has outliers, we can move on
# to creating other features, such as brand and pack size, from PROD_NAME. 
# We will start with pack size.

#### Pack size: we can work this out by taking the digits that are in PROD_NAME
transactionData <- transactionData %>% mutate(PACK_SIZE = parse_number(PROD_NAME))
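# parse_number() extracts the first number in each product name, which we assume
# is the pack weight in grams. An optional check: count product names with no
# "<digits>g" pattern; a non-zero result would mean that assumption fails
transactionData %>% filter(!str_detect(PROD_NAME, "\\d+\\s*[gG]")) %>% nrow()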

#### Let's check if the pack sizes look sensible; count pack sizes and group them
transactionData %>% group_by(PACK_SIZE) %>% summarize(Count = n()) %>% 
  arrange(desc(PACK_SIZE))
# The largest pack size is 380g and the smallest is 70g. This looks reasonable. 

#### Let's plot a histogram of PACK_SIZE; although it is numeric, it behaves as 
#### a categorical variable rather than a continuous one
transactionData %>% dplyr::select(PACK_SIZE) %>% 
  ggplot(aes(x = PACK_SIZE)) + geom_histogram(bins = 30)
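# Since PACK_SIZE takes only a handful of distinct values, an alternative view
# (optional) is a bar chart over the sizes as factors, which avoids histogram binning
transactionData %>% 
  ggplot(aes(x = factor(PACK_SIZE))) + geom_bar() +
  labs(x = "Pack size (g)", y = "Count") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))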

# Pack sizes look reasonably distributed. We would now like to create brands. 
# We can use the first word of PROD_NAME to work out the brand name

#### Brands
transactionData <- transactionData %>% 
  mutate(BRAND = toupper(word(PROD_NAME, 1)))

#### Check transaction volume grouped by Brands
transactionData %>% group_by(BRAND) %>% summarize(Count = n()) %>% 
  arrange(desc(Count))
#### Investigate the BRAND column further to check for duplicated brand names
transactionData %>% dplyr::select(BRAND) %>% distinct()
# Looks like there are some duplicate brand names, such as WW and WOOLWORTHS, 
# which both refer to the same brand (Woolworths). Let's combine them

#### Clean brand names
transactionData$BRAND[transactionData$BRAND == 'RRD'] <- 'RED'
transactionData$BRAND[transactionData$BRAND == 'WW'] <- 'WOOLWORTHS'
transactionData$BRAND[transactionData$BRAND == 'DORITO'] <- 'DORITOS'
transactionData$BRAND[transactionData$BRAND == 'SNBTS'] <- 'SUNBITES'
transactionData$BRAND[transactionData$BRAND == 'INFZNS'] <- 'INFUZIONS'
transactionData$BRAND[transactionData$BRAND == 'SMITH'] <- 'SMITHS'
transactionData$BRAND[transactionData$BRAND == 'GRAIN'] <- 'GRNWVES'
transactionData$BRAND[transactionData$BRAND == 'NCC'] <- 'NATURAL'
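# Equivalently, the cleanup above could be written in one step with
# dplyr::recode (shown commented out since the assignments above already ran):
# transactionData <- transactionData %>%
#   mutate(BRAND = recode(BRAND,
#                         "RRD"    = "RED",       "WW"     = "WOOLWORTHS",
#                         "DORITO" = "DORITOS",   "SNBTS"  = "SUNBITES",
#                         "INFZNS" = "INFUZIONS", "SMITH"  = "SMITHS",
#                         "GRAIN"  = "GRNWVES",   "NCC"    = "NATURAL"))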

#### Check brand names again
transactionData %>% dplyr::select(BRAND) %>% distinct() 
# The number of distinct brands is reduced by 8, from 28 to 20
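# A direct count of the remaining brands (optional check)
n_distinct(transactionData$BRAND)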


#### Visualize brands by total units sold
transactionData %>% group_by(BRAND) %>% summarize(units = sum(PROD_QTY)) %>%
  arrange(desc(units)) %>% 
  ggplot(aes(y = fct_reorder(BRAND, units), x = units, fill = BRAND)) +
  geom_col() +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")  +
  labs(x = "Total units sold", y = "Brand")

3. Customer Segmentation and Analytics
#### 3. Examine customer data
str(customerData)
## 'data.frame':    72637 obs. of  3 variables:
##  $ LYLTY_CARD_NBR  : int  1000 1002 1003 1004 1005 1007 1009 1010 1011 1012 ...
##  $ LIFESTAGE       : chr  "YOUNG SINGLES/COUPLES" "YOUNG SINGLES/COUPLES" "YOUNG FAMILIES" "OLDER SINGLES/COUPLES" ...
##  $ PREMIUM_CUSTOMER: chr  "Premium" "Mainstream" "Budget" "Mainstream" ...
# There are 72,637 customers in the customerData file, each at a different 
# stage of life. 

summary(customerData)
##  LYLTY_CARD_NBR     LIFESTAGE         PREMIUM_CUSTOMER  
##  Min.   :   1000   Length:72637       Length:72637      
##  1st Qu.:  66202   Class :character   Class :character  
##  Median : 134040   Mode  :character   Mode  :character  
##  Mean   : 136186                                        
##  3rd Qu.: 203375                                        
##  Max.   :2373711
# Let's have a closer look at the LIFESTAGE and PREMIUM_CUSTOMER columns

#### Examining the LIFESTAGE column
customerData %>% dplyr::select(LIFESTAGE) %>% distinct()
#### Group the number of customers by LIFESTAGE
customerData %>% group_by(LIFESTAGE) %>% 
  summarize(Count = n()) %>% 
  distinct() %>%
  arrange(desc(Count))
# The Loyalty Card program is most popular with the Retirees, Older Singles/Couples,
# and Young Singles/Couples

#### Examining the PREMIUM_CUSTOMER column
customerData %>% dplyr::select(PREMIUM_CUSTOMER) %>% distinct()
#### Group the number of customers by PREMIUM_CUSTOMER
customerData %>% group_by(PREMIUM_CUSTOMER) %>%
  summarize(Count = n()) %>%
  distinct() %>%
  arrange(desc(Count))
# The most common customer type is "Mainstream" (29,245)

# Since there aren't any issues with the customerData, we can now join it with 
# transactionData using a left-join of transactionData on customerData
mergedData <- left_join(transactionData, customerData, by = "LYLTY_CARD_NBR")

# transactionData and mergedData have the same number of rows, which means the 
# join was successful and no duplicates were created. A left join keeps all rows 
# from the left-hand table and joins them with matching rows from the right table; 
# any LYLTY_CARD_NBR without a match would produce NAs. We could also have used an 
# inner join: inner_join(transactionData, customerData, by = "LYLTY_CARD_NBR")
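# Verify the row counts match (a quick check of the claim above)
nrow(transactionData) == nrow(mergedData)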

# Let's check if there are any missing customers whose information did not match 
sum(is.na(mergedData))
## [1] 0
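# An equivalent, more targeted check: anti_join() returns the transactions whose
# LYLTY_CARD_NBR has no match in customerData (this should be zero rows)
anti_join(transactionData, customerData, by = "LYLTY_CARD_NBR") %>% nrow()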
# Note: We'll use this dataset ("mergedData") for Task 2
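#### Save the merged dataset for use in Task 2 (the file name is an assumption)
write_csv(mergedData, "QVI_data.csv")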

#### 4. Data Analysis on Customer Segments

# The data is now ready for analysis. We can define some metrics of interest to the client:
  # 1. Who spends the most on chips (total sales), describing customers by lifestage
  #    and how premium their general purchasing behavior is.
  # 2. How many customers are in each segment.
  # 3. How many chips are bought per customer, by segment.
  # 4. The average chip price by customer segment.
  # 5. Each customer's total spend over the period and per transaction, to 
  #    understand what portion of their grocery spend goes to chips (optional).
  # 6. The proportion of customers in each segment overall, to compare against the 
  #    mix of customers who purchase chips (optional).

# Let's start by calculating total sales by LIFESTAGE and PREMIUM_CUSTOMER, and
# plotting the split by these segments to see which customer segments buy the 
# most chips (contribute the most to chip sales)

#### Total Sales by LIFESTAGE and PREMIUM_CUSTOMER
sales_total <- mergedData %>% group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>% 
  summarize(SALES = sum(TOT_SALES), .groups = "keep")

# most popular LIFESTAGE/PREMIUM_CUSTOMER type by total sales
sales_total %>% arrange(desc(SALES))
#### Create plot for sales_total
p1 <- ggplot(sales_total) + 
  geom_mosaic(aes(weight = SALES, x = product(PREMIUM_CUSTOMER, LIFESTAGE), 
                  fill = PREMIUM_CUSTOMER)) +
  labs(x = "Lifestage", y = "Premium Customer type", title = "
       Proportion of Total Sales") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_fill_brewer(palette="Dark2") 

#### Plot and label with proportion of sales
p1 + geom_text(data = ggplot_build(p1)$data[[1]], aes(x = (xmin + xmax)/2, 
                                                      y = (ymin + ymax)/2,
              label = as.character(paste(round(.wt/sum(.wt), 3)*100, "%"))))

# Most of the contribution to chip sales comes from Budget Older Families, 
# Mainstream Young Singles/Couples, and Mainstream Retirees. This could simply 
# be because these segments have more customers who buy chips

#### Number of customers by LIFESTAGE and PREMIUM_CUSTOMER
customers <- mergedData %>% 
  group_by(PREMIUM_CUSTOMER, LIFESTAGE) %>% 
  summarize(CUSTOMERS = uniqueN(LYLTY_CARD_NBR), .groups = "keep") %>%
  arrange(desc(CUSTOMERS))

#### Create Plot
p2 <- ggplot(data = customers) + 
  geom_mosaic(aes(weight = CUSTOMERS, x = product(PREMIUM_CUSTOMER, LIFESTAGE), 
                  fill = PREMIUM_CUSTOMER)) +
  labs(x = "Lifestage", y = "Premium Customer type", 
       title = "Proportion of Total Sales") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) 

#### Plot and label with proportion of customers
p2 + geom_text(data = ggplot_build(p2)$data[[1]], aes(x = (xmin + xmax)/2, 
                                                      y = (ymin + ymax)/2,
                      label = as.character(paste(round(.wt/sum(.wt), 3)*100, "%")))) +
  scale_fill_brewer(palette="Dark2") 

# There are more Mainstream Young Singles/Couples and Mainstream Retirees who buy 
# chips, which contributes to the higher sales in these segments. However, this is 
# not the main driver for the Budget Older Families segment.

#### Average number of units per customer by LIFESTAGE and PREMIUM_CUSTOMER
sales_avg <- mergedData %>% group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
             summarise(AVG = sum(PROD_QTY)/uniqueN(LYLTY_CARD_NBR), .groups = "keep")

sales_avg %>% arrange(desc(AVG))
#### Create plot
ggplot(data = sales_avg, aes(weight = AVG, x = LIFESTAGE, fill = PREMIUM_CUSTOMER)) + 
  geom_bar(position = position_dodge()) + 
  labs(x = "Lifestage", y = "Avg units per transaction", title = "Units per customer") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) + 
  scale_fill_brewer(palette="Dark2")

# Older and younger families in general buy more chips per customer. 
# Let's also investigate the average price per unit of chips bought by each 
# customer segment, as this is also a driver of total sales

avg_price <- mergedData %>% group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>% 
             summarize(AVG = sum(TOT_SALES)/sum(PROD_QTY), .groups = "keep") %>%
             arrange(desc(AVG))

head(avg_price)
#### Create plot
ggplot(data = avg_price, aes(weight = AVG, x = LIFESTAGE, fill = PREMIUM_CUSTOMER)) +
  geom_bar(position = position_dodge()) +
  labs(x = "Lifestage", y = "Avg price per unit in AU$", title = "Price per unit") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) + 
  scale_fill_brewer(palette="Dark2")

# Mainstream Midage Singles/Couples and Mainstream Young Singles/Couples pay more 
# on average for each packet of chips than their budget and premium counterparts. 
# This could indicate that premium customers buy chips only for particular 
# occasions rather than for their own consumption. This hypothesis is further 
# supported by the fact that fewer premium and budget Young Singles/Couples and 
# Midage Singles/Couples buy chips compared to their mainstream counterparts.

#### Perform an independent t-test between mainstream vs premium and budget midage, and 
#### mainstream vs premium and budget young singles and couples
mergedData <- mergedData %>% mutate(price = TOT_SALES/PROD_QTY)

m1 <- mergedData %>% 
  filter(LIFESTAGE %in% c("YOUNG SINGLES/COUPLES", "MIDAGE SINGLES/COUPLES") 
                            & PREMIUM_CUSTOMER == "Mainstream") %>% 
  dplyr::pull(price)  # pull() returns a numeric vector, which t.test() expects

m2 <- mergedData %>% 
  filter(LIFESTAGE %in% c("YOUNG SINGLES/COUPLES", "MIDAGE SINGLES/COUPLES") 
                            & PREMIUM_CUSTOMER != "Mainstream") %>% 
  dplyr::pull(price)

t.test(m1, m2, alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  m1 and m2
## t = 37.624, df = 54791, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.3187234       Inf
## sample estimates:
## mean of x mean of y 
##  4.039786  3.706491
# The p-value is well below 0.05, so the unit price paid by mainstream young and 
# midage singles and couples is significantly higher than that paid by their 
# budget and premium counterparts
df <- mergedData %>% 
  mutate(GROUP = if_else(PREMIUM_CUSTOMER == "Mainstream", "Mainstream", 
                         "Non-Mainstream")) %>%
  dplyr::select(GROUP, price)
head(df)
df %>% ggplot(aes(x = price, fill = GROUP)) + 
  geom_histogram(bins = 9) +
  scale_color_brewer(palette="Dark2") +
  scale_fill_brewer(palette="Dark2") + 
  theme_bw() +
  labs(x = "Avg price per unit (AU$)", y = "Count")

#### 5. Deep diving into specific customer segments for insights

# We discovered quite a few interesting insights that we can investigate further.

# We might want to target the customer segments that contribute the most to sales 
# in order to retain them and grow sales further. Let's start by looking at 
# Mainstream Young Singles/Couples. For example, let's find out whether they tend 
# to buy a particular brand of chips

#### Deep diving into Mainstream, Young Singles/Couples
segment1 <- mergedData %>% 
  filter(LIFESTAGE == "YOUNG SINGLES/COUPLES" & PREMIUM_CUSTOMER == "Mainstream")

other <- mergedData %>% 
  filter(LIFESTAGE != "YOUNG SINGLES/COUPLES" & PREMIUM_CUSTOMER == "Mainstream")

quantity_segment1 <- sum(segment1$PROD_QTY)

quantity_other <- sum(other$PROD_QTY)

targetSegment1 <- segment1 %>% group_by(BRAND) %>% 
                summarise(targetSegment = sum(PROD_QTY)/quantity_segment1) 

targetOther <- other %>% group_by(BRAND) %>% 
                        summarise(other = sum(PROD_QTY)/quantity_other)

brand_proportions <- merge(targetSegment1, targetOther) %>% 
  mutate(affinityToBrand = targetSegment/other) 
 
brand_proportions %>% arrange(desc(affinityToBrand))
# We observe that:
# 1. Mainstream Young Singles/Couples are 23% more likely to buy TYRRELLS compared
  # to rest of the population
# 2. Mainstream Young Singles/Couples are 56% less likely to buy BURGER compared 
  # to rest of the population

# Let's find out if our target segment tends to also buy larger packs of chips
targetSegment1_by_pack <- segment1 %>% 
  group_by(PACK_SIZE) %>% 
  summarise(targetSegment = sum(PROD_QTY)/quantity_segment1)

targetOther_by_pack <- other %>% 
  group_by(PACK_SIZE) %>% 
  summarise(other = sum(PROD_QTY)/quantity_other)

pack_proportions <- merge(targetSegment1_by_pack, targetOther_by_pack) %>% 
  mutate(affinityToPack = targetSegment/other)

pack_proportions %>% arrange(desc(affinityToPack))
# Mainstream Young Singles/Couples are 25% more likely to buy a 270g pack of chips
# compared to the rest of the population. Which brands sell packs of this size?
mergedData %>% filter(PACK_SIZE == 270) %>% dplyr::select(PROD_NAME) %>% distinct()
# Twisties is the only brand selling this pack size, so this likely reflects a 
# higher likelihood of purchasing Twisties products
4. Conclusion

The Category Manager may want to increase the category’s performance by off-locating some Tyrrells and larger packs of chips in the discretionary space near areas that Mainstream Young Singles/Couples frequent, to increase visibility and encourage impulse buying.

Quantium can help the Category Manager with the following: