#### Load libraries
library(ggplot2)
library(tidyverse)
library(tibble)
library(cowplot)
library(fitdistrplus)
library(data.table)
library(readr)
library(readxl)
library(stringr)
library(formatR)
library(ggmosaic)
#### Load dataset
transactionData <- read_excel("QVI_transaction_data.xlsx")
customerData <- read.csv("QVI_purchase_behaviour.csv")
#### 2. Examine transaction data
str(transactionData)
## tibble [264,836 × 8] (S3: tbl_df/tbl/data.frame)
## $ DATE : num [1:264836] 43390 43599 43605 43329 43330 ...
## $ STORE_NBR : num [1:264836] 1 1 1 2 2 4 4 4 5 7 ...
## $ LYLTY_CARD_NBR: num [1:264836] 1000 1307 1343 2373 2426 ...
## $ TXN_ID : num [1:264836] 1 348 383 974 1038 ...
## $ PROD_NBR : num [1:264836] 5 66 61 69 108 57 16 24 42 52 ...
## $ PROD_NAME : chr [1:264836] "Natural Chip Compny SeaSalt175g" "CCs Nacho Cheese 175g" "Smiths Crinkle Cut Chips Chicken 170g" "Smiths Chip Thinly S/Cream&Onion 175g" ...
## $ PROD_QTY : num [1:264836] 2 3 2 5 3 1 1 1 1 2 ...
## $ TOT_SALES : num [1:264836] 6 6.3 2.9 15 13.8 5.1 5.7 3.6 3.9 7.2 ...
# DATE is stored as an Excel serial number; convert it to a Date using the Excel origin (1899-12-30)
transactionData$DATE <- as.Date(transactionData$DATE, origin = "1899-12-30")
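# As a quick sanity check (a minimal sketch), confirm the converted dates fall in the
# expected 12-month window:
range(transactionData$DATE) # expected to span roughly 2018-07-01 to 2019-06-30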
# Check that we are looking at the right products by examining PROD_NAME
str(transactionData$PROD_NAME)
## chr [1:264836] "Natural Chip Compny SeaSalt175g" ...
transactionData %>% count(PROD_NAME, sort = TRUE)
# Looks like we are definitely looking at potato chips, but we want to be sure
# these are all chips.
# Perform some basic text analysis by summarizing the individual words in the
# product name
#### Examine the words in PROD_NAME to see if there are any incorrect entries
#### such as products that are NOT chips
productWords <- strsplit(unique(as.character(transactionData$PROD_NAME)), " ") %>%
unlist() %>%
as.data.frame()
setnames(productWords, "Words")
#### Remove digits by filtering out all rows that contain digits, using str_detect()
productWords <- productWords %>% filter(!(str_detect(Words, "\\d")))
#### Remove special characters
productWords <- productWords %>% filter(str_detect(Words, "[:alpha:]"))
# Let's look at the most common words by counting the number of times each word
# appears and sorting by frequency in descending order (highest to lowest)
#### Sort words from highest to lowest frequency
productWords %>%
group_by(Words) %>% summarize(Count = n()) %>% arrange(desc(Count))
# We might want to remove the Salsa products because they're not considered chips
#### Remove Salsa products
transactionData <- transactionData %>%
dplyr::filter(!grepl('Salsa', PROD_NAME))
#### Check that no Salsa products remain
transactionData %>%
dplyr::filter(grepl('Salsa', PROD_NAME))
#### Let's check some transactions w.r.t. the loyalty card data
# Unique Loyalty Cards
transactionData %>% dplyr::select(LYLTY_CARD_NBR) %>% distinct()
# Top 50 repeat customers
transactionData %>% group_by(LYLTY_CARD_NBR) %>% summarise(count = n()) %>%
filter(count > 1) %>% arrange(desc(count)) %>% print(n = 50)
## # A tibble: 51,757 × 2
## LYLTY_CARD_NBR count
## <dbl> <int>
## 1 162039 17
## 2 230078 17
## 3 23192 16
## 4 105026 16
## 5 113080 16
## 6 116181 16
## 7 128178 16
## 8 172032 16
## 9 179228 16
## 10 30162 15
## 11 94185 15
## 12 104117 15
## 13 107167 15
## 14 109036 15
## 15 112231 15
## 16 129050 15
## 17 138016 15
## 18 152054 15
## 19 156240 15
## 20 207003 15
## 21 213140 15
## 22 222242 15
## 23 259100 15
## 24 5168 14
## 25 13177 14
## 26 28020 14
## 27 32060 14
## 28 33073 14
## 29 43109 14
## 30 48155 14
## 31 57033 14
## 32 58361 14
## 33 62093 14
## 34 63197 14
## 35 71075 14
## 36 100130 14
## 37 104061 14
## 38 105016 14
## 39 109080 14
## 40 112168 14
## 41 112218 14
## 42 118124 14
## 43 129087 14
## 44 133138 14
## 45 138222 14
## 46 152094 14
## 47 156054 14
## 48 164034 14
## 49 179110 14
## 50 179213 14
## # … with 51,707 more rows
# Multiple transaction IDs
transactionData %>% group_by(TXN_ID) %>% summarise(count = n()) %>%
filter(count > 1) %>% arrange(desc(count)) %>% print(n = 10)
## # A tibble: 1,479 × 2
## TXN_ID count
## <dbl> <int>
## 1 1162 3
## 2 102237 3
## 3 108462 3
## 4 211119 3
## 5 228741 3
## 6 230356 3
## 7 517 2
## 8 628 2
## 9 1142 2
## 10 1498 2
## # … with 1,469 more rows
# Check for the repeat transactions per Loyalty Card Number
transactionData %>% group_by(LYLTY_CARD_NBR, TXN_ID) %>%
summarize(count = n()) %>%
filter(count > 1) %>% arrange(desc(count))
#### Summarize the data to check for nulls and possible outliers
summary(transactionData)
## DATE STORE_NBR LYLTY_CARD_NBR TXN_ID
## Min. :2018-07-01 Min. : 1.0 Min. : 1000 Min. : 1
## 1st Qu.:2018-09-30 1st Qu.: 70.0 1st Qu.: 70015 1st Qu.: 67569
## Median :2018-12-30 Median :130.0 Median : 130367 Median : 135183
## Mean :2018-12-30 Mean :135.1 Mean : 135531 Mean : 135131
## 3rd Qu.:2019-03-31 3rd Qu.:203.0 3rd Qu.: 203084 3rd Qu.: 202654
## Max. :2019-06-30 Max. :272.0 Max. :2373711 Max. :2415841
## PROD_NBR PROD_NAME PROD_QTY TOT_SALES
## Min. : 1.00 Length:246742 Min. : 1.000 Min. : 1.700
## 1st Qu.: 26.00 Class :character 1st Qu.: 2.000 1st Qu.: 5.800
## Median : 53.00 Mode :character Median : 2.000 Median : 7.400
## Mean : 56.35 Mean : 1.908 Mean : 7.321
## 3rd Qu.: 87.00 3rd Qu.: 2.000 3rd Qu.: 8.800
## Max. :114.00 Max. :200.000 Max. :650.000
# There's an anomaly where the product quantity is 200. Let's investigate
# this further
transactionData %>% filter(PROD_QTY == 200)
# Both transactions are from the same customer, LYLTY_CARD_NBR 226000. Let's see if this
# customer had any other transactions
transactionData %>% filter(LYLTY_CARD_NBR == 226000)
# This customer has only two transactions recorded over the 12-month period,
# and both were for 200 packets of chips. This is an obvious anomaly, so we can
# remove this customer from the data.
#### Filter out the loyalty card number causing the anomaly
transactionData <- transactionData %>% filter(LYLTY_CARD_NBR != 226000)
#### Re-examine the data
summary(transactionData)
## DATE STORE_NBR LYLTY_CARD_NBR TXN_ID
## Min. :2018-07-01 Min. : 1.0 Min. : 1000 Min. : 1
## 1st Qu.:2018-09-30 1st Qu.: 70.0 1st Qu.: 70015 1st Qu.: 67569
## Median :2018-12-30 Median :130.0 Median : 130367 Median : 135182
## Mean :2018-12-30 Mean :135.1 Mean : 135530 Mean : 135130
## 3rd Qu.:2019-03-31 3rd Qu.:203.0 3rd Qu.: 203083 3rd Qu.: 202652
## Max. :2019-06-30 Max. :272.0 Max. :2373711 Max. :2415841
## PROD_NBR PROD_NAME PROD_QTY TOT_SALES
## Min. : 1.00 Length:246740 Min. :1.000 Min. : 1.700
## 1st Qu.: 26.00 Class :character 1st Qu.:2.000 1st Qu.: 5.800
## Median : 53.00 Mode :character Median :2.000 Median : 7.400
## Mean : 56.35 Mean :1.906 Mean : 7.316
## 3rd Qu.: 87.00 3rd Qu.:2.000 3rd Qu.: 8.800
## Max. :114.00 Max. :5.000 Max. :29.500
# The summary now looks much more coherent. Next, let's look at the number of
# transactions over time to check for any obvious data issues such as missing
# dates. We can do this by counting the number of transactions per day and
# sorting by date.
#### Count the number of transactions by date
transactionData %>% group_by(DATE) %>% summarize(count = n()) %>% arrange(DATE)
# There are only 364 dates, which means one date is missing. Let's create a
# sequence of dates from 1 Jul 2018 (2018-07-01) to 30 Jun 2019 (2019-06-30) and
# chart the number of transactions over time to find the missing date.
#### Generate a sequence of dates and join this with the count of transactions by date
allDates <- data.frame(seq(as.Date('2018-07-01'), as.Date('2019-06-30'), by = "day"))
setnames(allDates, "DATE")
#### Join allDates to the transaction data sorted by date using a Right Join.
transactions_by_day <-
transactionData %>%
group_by(DATE) %>% summarize(count = n()) %>% arrange(DATE)
transactions_by_day <- right_join(transactions_by_day, allDates, by = "DATE")
#### Let's plot the transactions over time; this will help us visualize the
#### missing date
transactions_by_day %>% ggplot(aes(x = DATE, y = count)) + geom_line() +
labs(x = "Date", y = "Number of transactions",
title = "Transactions per day (2018-2019)") +
theme_bw() + scale_x_date(breaks = "1 month") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
# We can see a sharp increase in the number of transactions during December, and
# then a break in late December. Let's zoom in and check what's happening.
#### Filter transactions by day for December 2018 only
transactions_by_day %>% filter(month(DATE) == 12) %>%
ggplot(aes(x = DATE, y = count)) + geom_line() +
labs(x = "Date", y = "Number of transactions",
title = "Transactions per day in December 2018") +
theme_bw() + scale_x_date(breaks = "1 day") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
# We can see that the increase in sales occurs in the lead-up to Christmas, and
# that there were 0 transactions on Christmas day itself.
# This is because shops are closed on Christmas day.
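# To pin down the gap programmatically (a small cross-check using the joined daily
# counts), we can list the date(s) with no recorded transactions:
transactions_by_day %>% filter(is.na(count)) # expected: 2018-12-25 (Christmas day)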
# Now that we are satisfied that the data no longer has outliers, we can move on
# to creating other features such as brand of chips or pack size, from PROD_NAME
# We will start with pack size.
#### Pack size: we can work this out by taking the digits that are in PROD_NAME
transactionData <- transactionData %>% mutate(PACK_SIZE = parse_number(PROD_NAME))
#### Let's check if the pack sizes look sensible; count pack sizes and group them
transactionData %>% group_by(PACK_SIZE) %>% summarize(Count = n()) %>%
arrange(desc(PACK_SIZE))
# The largest pack size is 380g and the smallest is 70g. This looks reasonable.
#### Let's plot a histogram of PACK_SIZE, since we know it's a categorical
#### variable and not a continuous one even though it is numeric
transactionData %>% dplyr::select(PACK_SIZE) %>%
ggplot(aes(x = PACK_SIZE)) + geom_histogram(bins = 30)
# Pack sizes look like they have a reasonable distribution. We would now like to
# create brands. We can use the first word in PROD_NAME to work out the brand name.
#### Brands
BRAND <- as.vector(toupper((word(transactionData$PROD_NAME, 1))))
transactionData <- cbind(transactionData, BRAND)
#### Check transaction volume grouped by Brands
transactionData %>% group_by(BRAND) %>% summarize(Count = n()) %>%
arrange(desc(Count))
#### Investigate the BRAND column further to check for duplicated brand names
transactionData %>% dplyr::select(BRAND) %>% distinct()
# Looks like there are some duplicate brand names, such as WW and WOOLWORTHS,
# both of which refer to the same brand, Woolworths. Let's combine them.
#### Clean brand names
transactionData$BRAND[transactionData$BRAND == 'RRD'] <- 'RED'
transactionData$BRAND[transactionData$BRAND == 'WW'] <- 'WOOLWORTHS'
transactionData$BRAND[transactionData$BRAND == 'DORITO'] <- 'DORITOS'
transactionData$BRAND[transactionData$BRAND == 'SNBTS'] <- 'SUNBITES'
transactionData$BRAND[transactionData$BRAND == 'INFZNS'] <- 'INFUZIONS'
transactionData$BRAND[transactionData$BRAND == 'SMITH'] <- 'SMITHS'
transactionData$BRAND[transactionData$BRAND == 'GRAIN'] <- 'GRNWVES'
transactionData$BRAND[transactionData$BRAND == 'NCC'] <- 'NATURAL'
#### Check brand names again
transactionData %>% dplyr::select(BRAND) %>% distinct()
# The number of distinct brands is reduced by 8, from 28 to 20.
#### Visualize the most common brand by total sales
transactionData %>% group_by(BRAND) %>% summarize(sales = sum(TOT_SALES)) %>%
arrange(desc(sales)) %>%
ggplot(aes(y= fct_reorder(BRAND, sales), x = sales, fill = BRAND)) +
geom_bar(stat = "identity") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 90), legend.position = "none") +
labs(x = "Total Sales", y = "Brand")#### 3. Examine customer data
str(customerData)
## 'data.frame': 72637 obs. of 3 variables:
## $ LYLTY_CARD_NBR : int 1000 1002 1003 1004 1005 1007 1009 1010 1011 1012 ...
## $ LIFESTAGE : chr "YOUNG SINGLES/COUPLES" "YOUNG SINGLES/COUPLES" "YOUNG FAMILIES" "OLDER SINGLES/COUPLES" ...
## $ PREMIUM_CUSTOMER: chr "Premium" "Mainstream" "Budget" "Mainstream" ...
# There are 72,637 loyalty card members in the customerData file, at different
# stages in their lives.
summary(customerData)
## LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
## Min. : 1000 Length:72637 Length:72637
## 1st Qu.: 66202 Class :character Class :character
## Median : 134040 Mode :character Mode :character
## Mean : 136186
## 3rd Qu.: 203375
## Max. :2373711
# Let's have a closer look at the LIFESTAGE and PREMIUM_CUSTOMER columns
#### Examining the LIFESTAGE column
customerData %>% dplyr::select(LIFESTAGE) %>% distinct()
#### Count the number of customers by LIFESTAGE
customerData %>% group_by(LIFESTAGE) %>%
summarize(Count = n()) %>%
distinct() %>%
arrange(desc(Count))
# The loyalty card program is most popular with Retirees, Older Singles/Couples,
# and Young Singles/Couples
#### Examining the PREMIUM_CUSTOMER column
customerData %>% dplyr::select(PREMIUM_CUSTOMER) %>% distinct()
#### Count the number of customers by PREMIUM_CUSTOMER
customerData %>% group_by(PREMIUM_CUSTOMER) %>%
summarize(Count = n()) %>%
distinct() %>%
arrange(desc(Count))
# The most common customer type is "Mainstream" (29,245 customers)
# Since there aren't any issues with the customerData, we can now join it with
# transactionData using a left-join of transactionData on customerData
mergedData <- left_join(transactionData, customerData, by = "LYLTY_CARD_NBR")
# The number of rows in transactionData and mergedData is the same, which means the
# join was successful and no duplicates were created. A left join takes all the rows
# from the left-hand table and joins them with matching rows in the right-hand table.
# If there were any unknown LYLTY_CARD_NBR values, there would be NAs. We could also
# have used an inner join: inner_join(transactionData, customerData, by = "LYLTY_CARD_NBR")
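# A quick verification of the claims above (a minimal sketch): the left join should
# keep exactly one row per transaction, provided LYLTY_CARD_NBR is unique in customerData.
nrow(mergedData) == nrow(transactionData) # expect TRUE
n_distinct(customerData$LYLTY_CARD_NBR) == nrow(customerData) # expect TRUE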
# Let's check if there are any missing customers whose information did not match
sum(is.na(mergedData))
## [1] 0
# Note: We'll use this dataset ("mergedData") for Task 2
#### 4. Data Analysis on Customer Segments
# The data is now ready for analysis. We can define some metrics of interest to the client:
# 1. Who spends the most on chips (total sales), describing customers by lifestage
# and by how premium their general purchasing behavior is.
# 2. How many customers are in each segment.
# 3. How many chips are bought per customer by segment.
# 4. What the average chip price is by customer segment.
# 5. Each customer's total spend over the period and per transaction, to understand
# what portion of their grocery budget is spent on chips (optional).
# 6. The proportion of customers in each segment overall, to compare against the mix
# of customers who purchase chips (optional; a sketch follows this list).
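# A hedged sketch for optional metric 6 (the names overall_mix and chip_mix are
# illustrative): compare the segment mix of all loyalty members against the mix of
# members who actually bought chips.
overall_mix <- customerData %>%
count(LIFESTAGE, PREMIUM_CUSTOMER, name = "members") %>%
mutate(prop_overall = members / sum(members))
chip_mix <- mergedData %>%
group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
summarize(chip_buyers = uniqueN(LYLTY_CARD_NBR), .groups = "drop") %>%
mutate(prop_chips = chip_buyers / sum(chip_buyers))
left_join(overall_mix, chip_mix, by = c("LIFESTAGE", "PREMIUM_CUSTOMER"))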
# Let's start by calculating total sales by LIFESTAGE and PREMIUM_CUSTOMER, and
# plotting the split by these segments to see which customer segments buy the
# most chips (i.e., contribute the most to chip sales).
#### Total Sales by LIFESTAGE and PREMIUM_CUSTOMER
sales_total <- mergedData %>% group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
summarize(SALES = sum(TOT_SALES), .groups = "keep")
# Most popular LIFESTAGE/PREMIUM_CUSTOMER segments by total sales
sales_total %>% arrange(desc(SALES))
#### Create plot for sales_total
p1 <- ggplot(sales_total) +
geom_mosaic(aes(weight = SALES, x = product(PREMIUM_CUSTOMER, LIFESTAGE),
fill = PREMIUM_CUSTOMER)) +
labs(x = "Lifestage", y = "Premium Customer type", title = "
Proportion of Total Sales") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
scale_fill_brewer(palette="Dark2")
#### Plot and label with proportion of sales
p1 + geom_text(data = ggplot_build(p1)$data[[1]], aes(x = (xmin + xmax)/2,
y = (ymin + ymax)/2,
label = as.character(paste(round(.wt/sum(.wt), 3)*100, "%"))))
# Looks like most of the contribution to chip sales comes from Budget Older Families,
# Mainstream Young Singles/Couples, and Mainstream Retirees. This could simply be
# because these groups have more customers who buy chips, so let's check the number
# of customers in each segment next.
#### Number of customers by LIFESTAGE and PREMIUM_CUSTOMER
customers <- mergedData %>%
group_by(PREMIUM_CUSTOMER, LIFESTAGE) %>%
summarize(CUSTOMERS = uniqueN(LYLTY_CARD_NBR), .groups = "keep") %>%
arrange(desc(CUSTOMERS))
#### Create Plot
p2 <- ggplot(data = customers) +
geom_mosaic(aes(weight = CUSTOMERS, x = product(PREMIUM_CUSTOMER, LIFESTAGE),
fill = PREMIUM_CUSTOMER)) +
labs(x = "Lifestage", y = "Premium Customer type",
title = "Proportion of Total Sales") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
#### Plot and label with proportion of customers
p2 + geom_text(data = ggplot_build(p2)$data[[1]], aes(x = (xmin + xmax)/2,
y = (ymin + ymax)/2,
label = as.character(paste(round(.wt/sum(.wt), 3)*100, "%")))) +
scale_fill_brewer(palette="Dark2") # There are more Mainstream Young Singles/Couples and Mainstream Retirees who buy chips.
# This contributes to there being more sales to these customer segments.
# However this is not a major driver for the Budget Older families Segment.
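# To check the Budget Older Families point (a hedged sketch; the name
# spend_per_customer is illustrative), look at average spend per customer by segment:
# a high value here suggests sales are driven by spend per customer rather than by
# customer numbers.
spend_per_customer <- mergedData %>%
group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
summarize(SPEND_PER_CUST = sum(TOT_SALES)/uniqueN(LYLTY_CARD_NBR), .groups = "keep")
spend_per_customer %>% arrange(desc(SPEND_PER_CUST))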
#### Average number of units per customer by LIFESTAGE and PREMIUM_CUSTOMER
sales_avg <- mergedData %>% group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
summarise(AVG = sum(PROD_QTY)/uniqueN(LYLTY_CARD_NBR), .groups = "keep")
sales_avg %>% arrange(desc(AVG))
#### Create plot
ggplot(data = sales_avg, aes(weight = AVG, x = LIFESTAGE, fill = PREMIUM_CUSTOMER)) +
geom_bar(position = position_dodge()) +
labs(x = "Lifestage", y = "Avg units per transaction", title = "Units per customer") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
scale_fill_brewer(palette="Dark2")# Older and younger families in general buy more chips per customer
# Let's also investigate the average price per unit of chips bought by each customer
# segment, as this is also a driver of total sales.
avg_price <- mergedData %>% group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
summarize(AVG = sum(TOT_SALES)/sum(PROD_QTY), .groups = "keep") %>%
arrange(desc(AVG))
head(avg_price)
#### Create plot
ggplot(data = avg_price, aes(weight = AVG, x = LIFESTAGE, fill = PREMIUM_CUSTOMER)) +
geom_bar(position = position_dodge()) +
labs(x = "Lifestage", y = "Avg price per unit in AU$", title = "Price per unit") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
scale_fill_brewer(palette="Dark2")# Mainstream Midage Singles/Couples and Mainstream Young Singles/Couples on an average
# pay more for each packet of chips compared to their budget and premium counterparts.
# This could also indicate that premium customer are more likely to buy chips, and only
# for particular occasions, rather than their own consumption. This hypothesis is further
# supported by the fact that there are fewer premium and budget Young Sigles/Couples, and
# Midage Singles/Couples who buy chips compared to their maintream counterparts.
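# A quick supporting check (a small sketch): customer counts for these two lifestages
# split by PREMIUM_CUSTOMER, to confirm that Mainstream shoppers outnumber their
# Budget and Premium counterparts.
mergedData %>%
filter(LIFESTAGE %in% c("YOUNG SINGLES/COUPLES", "MIDAGE SINGLES/COUPLES")) %>%
group_by(LIFESTAGE, PREMIUM_CUSTOMER) %>%
summarize(CUSTOMERS = uniqueN(LYLTY_CARD_NBR), .groups = "keep") %>%
arrange(LIFESTAGE, desc(CUSTOMERS))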
#### Perform an independent t-test between mainstream vs premium and budget midage, and
#### mainstream vs premium and budget young singles and couples
mergedData <- mergedData %>% mutate(price = TOT_SALES/PROD_QTY)
# Use pull() so m1 and m2 are numeric vectors, as expected by t.test()
m1 <- mergedData %>%
filter(LIFESTAGE %in% c("YOUNG SINGLES/COUPLES", "MIDAGE SINGLES/COUPLES")
& PREMIUM_CUSTOMER == "Mainstream") %>% pull(price)
m2 <- mergedData %>%
filter(LIFESTAGE %in% c("YOUNG SINGLES/COUPLES", "MIDAGE SINGLES/COUPLES")
& PREMIUM_CUSTOMER != "Mainstream") %>% pull(price)
t.test(m1, m2, alternative = "greater")
##
## Welch Two Sample t-test
##
## data: m1 and m2
## t = 37.624, df = 54791, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.3187234 Inf
## sample estimates:
## mean of x mean of y
## 4.039786 3.706491
# The p-value being less than 0.05 implies that the unit price for Mainstream young
# and midage singles and couples is significantly higher than for their Budget and
# Premium counterparts.
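# Beyond statistical significance, a rough effect-size sketch (Cohen's d with a simple
# pooled SD; this approximation assumes broadly similar group variances):
pooled_sd <- sqrt((var(m1) + var(m2))/2)
(mean(m1) - mean(m2))/pooled_sd # by Cohen's conventions, ~0.2 is small and ~0.5 medium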
df <- mergedData %>%
mutate(GROUP = if_else(PREMIUM_CUSTOMER == "Mainstream", "Mainstream",
"Non-Mainstream")) %>%
dplyr::select(GROUP, price)
head(df)
df %>% ggplot(aes(x = price, fill = GROUP)) +
geom_histogram(bins = 9) +
scale_color_brewer(palette="Dark2") +
scale_fill_brewer(palette="Dark2") +
theme_bw() +
labs(x = "Avg price per unit (AU$)", y = "Count")#### 5. Deep diving into specific customer segments for insights
# We discovered quite a few interesting insights that we can investigate further.
# We might want to target customer segments that contribute the most to sales,
# to retain them and further increase sales. Let's start by looking at Mainstream
# Young Singles/Couples. For example, let's try and find out if they tend to buy a
# particular brand of chips
#### Deep diving into Mainstream, Young Singles/Couples
segment1 <- mergedData %>%
filter(LIFESTAGE == "YOUNG SINGLES/COUPLES" & PREMIUM_CUSTOMER == "Mainstream")
other <- mergedData %>%
filter(LIFESTAGE != "YOUNG SINGLES/COUPLES" & PREMIUM_CUSTOMER == "Mainstream")
quantity_segment1 <- sum(segment1$PROD_QTY)
quantity_other <- sum(other$PROD_QTY)
targetSegment1 <- segment1 %>% group_by(BRAND) %>%
summarise(targetSegment = sum(PROD_QTY)/quantity_segment1)
targetOther <- other %>% group_by(BRAND) %>%
summarise(other = sum(PROD_QTY)/quantity_other)
brand_proportions <- merge(targetSegment1, targetOther) %>%
mutate(affinityToBrand = targetSegment/other)
brand_proportions %>% arrange(desc(affinityToBrand))
# We observe that:
# 1. Mainstream Young Singles/Couples are 23% more likely to buy TYRRELLS compared
# to the rest of the population.
# 2. Mainstream Young Singles/Couples are 56% less likely to buy BURGER compared
# to the rest of the population.
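# For readability (an illustrative sketch; the column name pct_vs_rest is mine), the
# quoted percentages can be read straight off the affinity ratio:
brand_proportions %>%
mutate(pct_vs_rest = round((affinityToBrand - 1)*100, 1)) %>%
arrange(desc(pct_vs_rest))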
# Let's find out if our target segment tends to also buy larger packs of chips
targetSegment1_by_pack <- segment1 %>%
group_by(PACK_SIZE) %>%
summarise(targetSegment = sum(PROD_QTY)/quantity_segment1)
targetOther_by_pack <- other %>%
group_by(PACK_SIZE) %>%
summarise(other = sum(PROD_QTY)/quantity_other)
pack_proportions <- merge(targetSegment1_by_pack, targetOther_by_pack) %>%
mutate(affinityToPack = targetSegment/other)
pack_proportions %>% arrange(desc(affinityToPack))
# Mainstream Young Singles/Couples are 25% more likely to buy a 270g pack of chips
# compared to the rest of the population. Which brands sell packs of this size?
mergedData %>% filter(PACK_SIZE == 270) %>% dplyr::select(PROD_NAME) %>% distinct()
# Twisties are the only brand that sells packs of this size, so this likely reflects
# a higher likelihood of purchasing Twisties products.
#### 6. Conclusion
Sales have mainly been driven by Budget Older Families, Mainstream Young Singles/Couples, and Mainstream Retirees.
The high volume of chip sales among Mainstream Young Singles/Couples and Retirees can be attributed to there being more of these customers who buy chips, compared to other segments.
Mainstream Midage and Young Singles/Couples are also more likely to pay more per packet of chips, which is indicative of impulse buying behavior.
Mainstream Young Singles/Couples are 23% more likely to buy Tyrrells chips than the rest of the population.
The Category Manager may want to increase the category's performance by off-locating some Tyrrells and smaller packs of chips in the discretionary space near areas that Young Singles/Couples frequent more often, to increase visibility and trigger impulse purchases.
Quantium can help the Category Manager with the following:
1. Recommending where these segments shop.
2. Measuring the impact of the changed placement.