library(data.table)
library(ggplot2)
library(readr)
library(stringr)
library(dplyr)
filePath <- '/Users/daniel/Desktop/Data/R/Quantium/'
transactionData <- fread(paste0(filePath, 'QVI_transaction_data.csv'))
customerData <- fread(paste0(filePath, 'QVI_purchase_behaviour.csv'))
head(transactionData) # inspecting the first six rows of the dataset
##     DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
##    <int>     <int>          <int>  <int>    <int>
## 1: 43390         1           1000      1        5
## 2: 43599         1           1307    348       66
## 3: 43605         1           1343    383       61
## 4: 43329         2           2373    974       69
## 5: 43330         2           2426   1038      108
## 6: 43604         4           4074   2982       57
##                                   PROD_NAME PROD_QTY TOT_SALES
##                                      <char>    <int>     <num>
## 1:   Natural Chip        Compny SeaSalt175g        2       6.0
## 2:                 CCs Nacho Cheese    175g        3       6.3
## 3:   Smiths Crinkle Cut  Chips Chicken 170g        2       2.9
## 4:   Smiths Chip Thinly  S/Cream&Onion 175g        5      15.0
## 5: Kettle Tortilla ChpsHny&Jlpno Chili 150g        3      13.8
## 6: Old El Paso Salsa   Dip Tomato Mild 300g        1       5.1

The ‘DATE’ column is not in DATE format

str(transactionData)
## Classes 'data.table' and 'data.frame':   264836 obs. of  8 variables:
##  $ DATE          : int  43390 43599 43605 43329 43330 43604 43601 43601 43332 43330 ...
##  $ STORE_NBR     : int  1 1 1 2 2 4 4 4 5 7 ...
##  $ LYLTY_CARD_NBR: int  1000 1307 1343 2373 2426 4074 4149 4196 5026 7150 ...
##  $ TXN_ID        : int  1 348 383 974 1038 2982 3333 3539 4525 6900 ...
##  $ PROD_NBR      : int  5 66 61 69 108 57 16 24 42 52 ...
##  $ PROD_NAME     : chr  "Natural Chip        Compny SeaSalt175g" "CCs Nacho Cheese    175g" "Smiths Crinkle Cut  Chips Chicken 170g" "Smiths Chip Thinly  S/Cream&Onion 175g" ...
##  $ PROD_QTY      : int  2 3 2 5 3 1 1 1 1 2 ...
##  $ TOT_SALES     : num  6 6.3 2.9 15 13.8 5.1 5.7 3.6 3.9 7.2 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# converting DATE column to Date format
transactionData$DATE <- as.Date(transactionData$DATE, origin = "1899-12-30")

head(transactionData) # checking DATE column is accurate 
##          DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
##        <Date>     <int>          <int>  <int>    <int>
## 1: 2018-10-17         1           1000      1        5
## 2: 2019-05-14         1           1307    348       66
## 3: 2019-05-20         1           1343    383       61
## 4: 2018-08-17         2           2373    974       69
## 5: 2018-08-18         2           2426   1038      108
## 6: 2019-05-19         4           4074   2982       57
##                                   PROD_NAME PROD_QTY TOT_SALES
##                                      <char>    <int>     <num>
## 1:   Natural Chip        Compny SeaSalt175g        2       6.0
## 2:                 CCs Nacho Cheese    175g        3       6.3
## 3:   Smiths Crinkle Cut  Chips Chicken 170g        2       2.9
## 4:   Smiths Chip Thinly  S/Cream&Onion 175g        5      15.0
## 5: Kettle Tortilla ChpsHny&Jlpno Chili 150g        3      13.8
## 6: Old El Paso Salsa   Dip Tomato Mild 300g        1       5.1
summary(transactionData) # checking for null values and outliers
##       DATE              STORE_NBR     LYLTY_CARD_NBR        TXN_ID       
##  Min.   :2018-07-01   Min.   :  1.0   Min.   :   1000   Min.   :      1  
##  1st Qu.:2018-09-30   1st Qu.: 70.0   1st Qu.:  70021   1st Qu.:  67602  
##  Median :2018-12-30   Median :130.0   Median : 130358   Median : 135138  
##  Mean   :2018-12-30   Mean   :135.1   Mean   : 135550   Mean   : 135158  
##  3rd Qu.:2019-03-31   3rd Qu.:203.0   3rd Qu.: 203094   3rd Qu.: 202701  
##  Max.   :2019-06-30   Max.   :272.0   Max.   :2373711   Max.   :2415841  
##     PROD_NBR       PROD_NAME            PROD_QTY         TOT_SALES      
##  Min.   :  1.00   Length:264836      Min.   :  1.000   Min.   :  1.500  
##  1st Qu.: 28.00   Class :character   1st Qu.:  2.000   1st Qu.:  5.400  
##  Median : 56.00   Mode  :character   Median :  2.000   Median :  7.400  
##  Mean   : 56.58                      Mean   :  1.907   Mean   :  7.304  
##  3rd Qu.: 85.00                      3rd Qu.:  2.000   3rd Qu.:  9.200  
##  Max.   :114.00                      Max.   :200.000   Max.   :650.000
transactionData[PROD_QTY == 200] # inspecting outlier of PROD_QTY at 200 
##          DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
##        <Date>     <int>          <int>  <int>    <int>
## 1: 2018-08-19       226         226000 226201        4
## 2: 2019-05-20       226         226000 226210        4
##                           PROD_NAME PROD_QTY TOT_SALES
##                              <char>    <int>     <num>
## 1: Dorito Corn Chp     Supreme 380g      200       650
## 2: Dorito Corn Chp     Supreme 380g      200       650
transactionData[LYLTY_CARD_NBR == 226000] # other transactions for this customer
##          DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
##        <Date>     <int>          <int>  <int>    <int>
## 1: 2018-08-19       226         226000 226201        4
## 2: 2019-05-20       226         226000 226210        4
##                           PROD_NAME PROD_QTY TOT_SALES
##                              <char>    <int>     <num>
## 1: Dorito Corn Chp     Supreme 380g      200       650
## 2: Dorito Corn Chp     Supreme 380g      200       650
# removing outlier, not representative of customer spending habits
transactionData <- transactionData[!LYLTY_CARD_NBR == 226000]

Client is focused on chip spending habits

productWords <- data.table(words = unlist(strsplit(transactionData$PROD_NAME, " ")))

productWordsCount <- productWords[, .(count = .N), by = words][order(-count)]

head(productWordsCount, 25) # checking for non-chip items
##        words  count
##       <char>  <int>
##  1:          504830
##  2:     175g  60561
##  3:    Chips  49770
##  4:     150g  41633
##  5:   Kettle  41288
##  6:        &  35565
##  7:   Smiths  28860
##  8:     Salt  27976
##  9:   Cheese  27890
## 10: Pringles  25102
## 11:     134g  25102
## 12:  Doritos  24962
## 13:  Crinkle  23960
## 14:     110g  22387
## 15:     Corn  22061
## 16: Original  21560
## 17:      Cut  20754
## 18:     Chip  18645
## 19:     170g  18502
## 20:    Salsa  18094
## 21:  Chicken  15407
## 22:     165g  15297
## 23:      Sea  14145
## 24:    Thins  14075
## 25:     Sour  13882
##        words  count
chip_keywords <- c("chips", "crisps", "potato", "snack", "pringles", "lays", "nacho", "tortilla", "crispy")

non_chip_products <- transactionData[!grepl(paste(chip_keywords, collapse = "|"), PROD_NAME, ignore.case = TRUE)]

head(non_chip_products)
##          DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
##        <Date>     <int>          <int>  <int>    <int>
## 1: 2018-10-17         1           1000      1        5
## 2: 2018-08-17         2           2373    974       69
## 3: 2019-05-19         4           4074   2982       57
## 4: 2019-05-16         4           4196   3539       24
## 5: 2018-08-20         5           5026   4525       42
## 6: 2018-08-18         7           7150   6900       52
##                                   PROD_NAME PROD_QTY TOT_SALES
##                                      <char>    <int>     <num>
## 1:   Natural Chip        Compny SeaSalt175g        2       6.0
## 2:   Smiths Chip Thinly  S/Cream&Onion 175g        5      15.0
## 3: Old El Paso Salsa   Dip Tomato Mild 300g        1       5.1
## 4:    Grain Waves         Sweet Chilli 210g        1       3.6
## 5:  Doritos Corn Chip Mexican Jalapeno 150g        1       3.9
## 6:    Grain Waves Sour    Cream&Chives 210G        2       7.2
# Salsa products are not relevant to chips spending habits
# Adding a column to identify rows with 'salsa'
transactionData[, SALSA := grepl('salsa', tolower(PROD_NAME))]

# Removing rows with 'salsa' and the SALSA column
transactionData <- transactionData[SALSA == FALSE][, SALSA := NULL]
# New features (brand name, pack size)

transactionData[, BRAND := word(transactionData$PROD_NAME, 1)]

unique(transactionData[, BRAND])
##  [1] "Natural"    "CCs"        "Smiths"     "Kettle"     "Grain"     
##  [6] "Doritos"    "Twisties"   "WW"         "Thins"      "Burger"    
## [11] "NCC"        "Cheezels"   "Infzns"     "Red"        "Pringles"  
## [16] "Dorito"     "Infuzions"  "Smith"      "GrnWves"    "Tyrrells"  
## [21] "Cobs"       "French"     "RRD"        "Tostitos"   "Cheetos"   
## [26] "Woolworths" "Snbts"      "Sunbites"

Standardise brand names

There are brands in this dataset with very similar names. By standardising brand names, we can acquire more accurate insights e.g. concerning sales of Doritos

transactionData[BRAND == "Red", BRAND := "RRD"]
transactionData[BRAND == "Infzns", BRAND := "Infuzions"]
transactionData[BRAND == "Snbts", BRAND := "Sunbites"]
transactionData[BRAND == "Smith", BRAND := "Smiths"]
transactionData[BRAND == "Dorito", BRAND := "Doritos"]

unique(transactionData[, BRAND])
##  [1] "Natural"    "CCs"        "Smiths"     "Kettle"     "Grain"     
##  [6] "Doritos"    "Twisties"   "WW"         "Thins"      "Burger"    
## [11] "NCC"        "Cheezels"   "Infuzions"  "RRD"        "Pringles"  
## [16] "GrnWves"    "Tyrrells"   "Cobs"       "French"     "Tostitos"  
## [21] "Cheetos"    "Woolworths" "Sunbites"
# Pack Size column

transactionData[, PACK_SIZE := parse_number(transactionData$PROD_NAME)]
# Inspecting sales over time

transactions_day <- transactionData[, .(Count = .N), by = DATE][order(DATE)]
transactions_day
##            DATE Count
##          <Date> <int>
##   1: 2018-07-01   663
##   2: 2018-07-02   650
##   3: 2018-07-03   674
##   4: 2018-07-04   669
##   5: 2018-07-05   660
##  ---                 
## 360: 2019-06-26   657
## 361: 2019-06-27   669
## 362: 2019-06-28   673
## 363: 2019-06-29   703
## 364: 2019-06-30   704
# There are only 364 rows, what day has 0 orders?

dates <- data.table(DATE = seq(as.Date('2018-07-01'), as.Date('2019-06-30'), 'day'))
dates_count <- dates[, .(Count = .N), by = DATE]
dates_count
##            DATE Count
##          <Date> <int>
##   1: 2018-07-01     1
##   2: 2018-07-02     1
##   3: 2018-07-03     1
##   4: 2018-07-04     1
##   5: 2018-07-05     1
##  ---                 
## 361: 2019-06-26     1
## 362: 2019-06-27     1
## 363: 2019-06-28     1
## 364: 2019-06-29     1
## 365: 2019-06-30     1
merged_dates <- transactions_day[dates, on = .(DATE)]
merged_dates[is.na(Count), DATE] # date of day with no orders
## [1] "2018-12-25"
merged_dates[is.na(Count), Count := 0]
merged_dates
##            DATE Count
##          <Date> <int>
##   1: 2018-07-01   663
##   2: 2018-07-02   650
##   3: 2018-07-03   674
##   4: 2018-07-04   669
##   5: 2018-07-05   660
##  ---                 
## 361: 2019-06-26   657
## 362: 2019-06-27   669
## 363: 2019-06-28   673
## 364: 2019-06-29   703
## 365: 2019-06-30   704
# Plot orders over time

ggplot(transactions_day, aes(x = DATE, y = Count)) +
  geom_line() +
  scale_x_date(breaks = "1 month") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# Peak in December

dec_dates <- transactions_day[DATE > '2018-11-30' & DATE < '2019-01-01']
# dec_dates

ggplot(dec_dates, aes(x = DATE, y = Count)) +
  geom_line() +                         # creates line graph
  scale_x_date(breaks = '1 day') +      # scale of x-axis
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, size = 7.5)) +
  labs(x = 'Date of Order', y = 'Number of Orders', title = 'Orders over December')

Customer Data

str(customerData)       # checking column data types
## Classes 'data.table' and 'data.frame':   72637 obs. of  3 variables:
##  $ LYLTY_CARD_NBR  : int  1000 1002 1003 1004 1005 1007 1009 1010 1011 1012 ...
##  $ LIFESTAGE       : chr  "YOUNG SINGLES/COUPLES" "YOUNG SINGLES/COUPLES" "YOUNG FAMILIES" "OLDER SINGLES/COUPLES" ...
##  $ PREMIUM_CUSTOMER: chr  "Premium" "Mainstream" "Budget" "Mainstream" ...
##  - attr(*, ".internal.selfref")=<externalptr>
summary(customerData)   # check for null values/outliers
##  LYLTY_CARD_NBR     LIFESTAGE         PREMIUM_CUSTOMER  
##  Min.   :   1000   Length:72637       Length:72637      
##  1st Qu.:  66202   Class :character   Class :character  
##  Median : 134040   Mode  :character   Mode  :character  
##  Mean   : 136186                                        
##  3rd Qu.: 203375                                        
##  Max.   :2373711
# Lifestage Distribution

lifestageCount <- customerData[, .(Count = .N), by = LIFESTAGE]
lifestageCount
##                 LIFESTAGE Count
##                    <char> <int>
## 1:  YOUNG SINGLES/COUPLES 14441
## 2:         YOUNG FAMILIES  9178
## 3:  OLDER SINGLES/COUPLES 14609
## 4: MIDAGE SINGLES/COUPLES  7275
## 5:           NEW FAMILIES  2549
## 6:         OLDER FAMILIES  9780
## 7:               RETIREES 14805
# Distribution of Life stages plot

ggplot(lifestageCount, aes(x = '', y = Count, fill = LIFESTAGE)) +
  geom_bar(stat = 'identity') + 
  coord_polar('y') +  # converts bar chart to pie chart
  theme_void() +      # clearer format
  geom_text(aes(label = Count), 
            position = position_stack(vjust = 0.5), # positions label centrally
            size = 2) +
  labs(title = "Distribution of Life Stages")

# Premium Distribution
premiumCount <- customerData[, .(Count = .N), by = PREMIUM_CUSTOMER]
premiumCount
##    PREMIUM_CUSTOMER Count
##              <char> <int>
## 1:          Premium 18922
## 2:       Mainstream 29245
## 3:           Budget 24470
ggplot(premiumCount, aes(x = "", y = Count, fill = PREMIUM_CUSTOMER)) +
  geom_bar(stat = 'identity') +
  coord_polar("y") +
  theme_void() +
  geom_text(aes(label = paste0(PREMIUM_CUSTOMER, "\n", Count)),
            position = position_stack(vjust = 0.5)) +
  labs(title = "Distribution of Spending Profiles",
       fill = 'Spending Profile')

# Merging the two data sets

data <- customerData[transactionData, on = .(LYLTY_CARD_NBR)]
summary(data)
##  LYLTY_CARD_NBR     LIFESTAGE         PREMIUM_CUSTOMER        DATE           
##  Min.   :   1000   Length:246740      Length:246740      Min.   :2018-07-01  
##  1st Qu.:  70015   Class :character   Class :character   1st Qu.:2018-09-30  
##  Median : 130367   Mode  :character   Mode  :character   Median :2018-12-30  
##  Mean   : 135530                                         Mean   :2018-12-30  
##  3rd Qu.: 203083                                         3rd Qu.:2019-03-31  
##  Max.   :2373711                                         Max.   :2019-06-30  
##    STORE_NBR         TXN_ID           PROD_NBR       PROD_NAME        
##  Min.   :  1.0   Min.   :      1   Min.   :  1.00   Length:246740     
##  1st Qu.: 70.0   1st Qu.:  67569   1st Qu.: 26.00   Class :character  
##  Median :130.0   Median : 135182   Median : 53.00   Mode  :character  
##  Mean   :135.1   Mean   : 135130   Mean   : 56.35                     
##  3rd Qu.:203.0   3rd Qu.: 202652   3rd Qu.: 87.00                     
##  Max.   :272.0   Max.   :2415841   Max.   :114.00                     
##     PROD_QTY       TOT_SALES         BRAND             PACK_SIZE    
##  Min.   :1.000   Min.   : 1.700   Length:246740      Min.   : 70.0  
##  1st Qu.:2.000   1st Qu.: 5.800   Class :character   1st Qu.:150.0  
##  Median :2.000   Median : 7.400   Mode  :character   Median :170.0  
##  Mean   :1.906   Mean   : 7.316                      Mean   :175.6  
##  3rd Qu.:2.000   3rd Qu.: 8.800                      3rd Qu.:175.0  
##  Max.   :5.000   Max.   :29.500                      Max.   :380.0
# Move DATE column to the front

setcolorder(data, c("DATE", setdiff(names(data), "DATE")))
names(data)
##  [1] "DATE"             "LYLTY_CARD_NBR"   "LIFESTAGE"        "PREMIUM_CUSTOMER"
##  [5] "STORE_NBR"        "TXN_ID"           "PROD_NBR"         "PROD_NAME"       
##  [9] "PROD_QTY"         "TOT_SALES"        "BRAND"            "PACK_SIZE"
# Variable Summaries ------------------------------------------------------

# Brands by Life Stage

unique(data[, BRAND])
##  [1] "Natural"    "CCs"        "Smiths"     "Kettle"     "Grain"     
##  [6] "Doritos"    "Twisties"   "WW"         "Thins"      "Burger"    
## [11] "NCC"        "Cheezels"   "Infuzions"  "RRD"        "Pringles"  
## [16] "GrnWves"    "Tyrrells"   "Cobs"       "French"     "Tostitos"  
## [21] "Cheetos"    "Woolworths" "Sunbites"
lifestage_brand <- data[, .(Count = .N), by = .(LIFESTAGE, BRAND)][order(-Count)]
lifestage_brand[order(LIFESTAGE)]
##                   LIFESTAGE      BRAND Count
##                      <char>     <char> <int>
##   1: MIDAGE SINGLES/COUPLES     Kettle  4055
##   2: MIDAGE SINGLES/COUPLES     Smiths  2790
##   3: MIDAGE SINGLES/COUPLES    Doritos  2423
##   4: MIDAGE SINGLES/COUPLES   Pringles  2389
##   5: MIDAGE SINGLES/COUPLES        RRD  1478
##  ---                                        
## 157:  YOUNG SINGLES/COUPLES     French   194
## 158:  YOUNG SINGLES/COUPLES Woolworths   192
## 159:  YOUNG SINGLES/COUPLES     Burger   178
## 160:  YOUNG SINGLES/COUPLES    GrnWves   163
## 161:  YOUNG SINGLES/COUPLES        NCC   158
brandCount <- data[, .(Count = .N), by = BRAND][order(-Count)]
brandCount
##          BRAND Count
##         <char> <int>
##  1:     Kettle 41288
##  2:     Smiths 30353
##  3:    Doritos 25224
##  4:   Pringles 25102
##  5:        RRD 16321
##  6:  Infuzions 14201
##  7:      Thins 14075
##  8:         WW 10320
##  9:       Cobs  9693
## 10:   Tostitos  9471
## 11:   Twisties  9454
## 12:   Tyrrells  6442
## 13:      Grain  6272
## 14:    Natural  6050
## 15:   Cheezels  4603
## 16:        CCs  4551
## 17:   Sunbites  3008
## 18:    Cheetos  2927
## 19:     Burger  1564
## 20: Woolworths  1516
## 21:    GrnWves  1468
## 22:        NCC  1419
## 23:     French  1418
##          BRAND Count
# Stacked Bar Chart, lifestage and brand
ggplot(lifestage_brand, aes(x = BRAND, y = Count, fill = LIFESTAGE)) +
  geom_bar(stat = 'identity') + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

most purchased brands are Kettle, Smiths and Doritos

lifestage_brand[BRAND %in% c('Kettle', 'Smiths', 'Doritos')][order(BRAND)]
##                  LIFESTAGE   BRAND Count
##                     <char>  <char> <int>
##  1:  OLDER SINGLES/COUPLES Doritos  5326
##  2:               RETIREES Doritos  4987
##  3:         OLDER FAMILIES Doritos  4218
##  4:         YOUNG FAMILIES Doritos  3894
##  5:  YOUNG SINGLES/COUPLES Doritos  3650
##  6: MIDAGE SINGLES/COUPLES Doritos  2423
##  7:           NEW FAMILIES Doritos   726
##  8:  OLDER SINGLES/COUPLES  Kettle  8847
##  9:               RETIREES  Kettle  8194
## 10:         OLDER FAMILIES  Kettle  6851
## 11:         YOUNG FAMILIES  Kettle  6277
## 12:  YOUNG SINGLES/COUPLES  Kettle  5893
## 13: MIDAGE SINGLES/COUPLES  Kettle  4055
## 14:           NEW FAMILIES  Kettle  1171
## 15:         OLDER FAMILIES  Smiths  6138
## 16:  OLDER SINGLES/COUPLES  Smiths  6032
## 17:         YOUNG FAMILIES  Smiths  5399
## 18:               RETIREES  Smiths  5374
## 19:  YOUNG SINGLES/COUPLES  Smiths  3893
## 20: MIDAGE SINGLES/COUPLES  Smiths  2790
## 21:           NEW FAMILIES  Smiths   727
##                  LIFESTAGE   BRAND Count
# Percentage of Sales by Life stage -----------------------------------------------------

sales_lifestage <- data[, .(sum = round(sum(TOT_SALES))), by = LIFESTAGE]
sales_lifestage <- sales_lifestage %>% 
  mutate(Percentage = sum / sum(sum) * 100)

ggplot(sales_lifestage, aes(x = LIFESTAGE, y = Percentage, fill = LIFESTAGE)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_text(aes(label = round(Percentage, 2)),
            position = position_stack(vjust = 0.5),
            size = 3,
            show.legend = FALSE) +
  labs(title = 'Sales by Life stage') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

OLDER SINGLES/COUPLES, RETIREES and OLDER FAMILIES are the greatest contributors to total sales.

Customer Segmentation —————————————————

Customer Segment #1 - by Life stage and Spending Profile

lifestage_profile <- data[, .(Count = .N), by = .(LIFESTAGE, PREMIUM_CUSTOMER)][order(-Count)]

lifestage_profile
##                  LIFESTAGE PREMIUM_CUSTOMER Count
##                     <char>           <char> <int>
##  1:         OLDER FAMILIES           Budget 21514
##  2:               RETIREES       Mainstream 19970
##  3:  YOUNG SINGLES/COUPLES       Mainstream 19544
##  4:         YOUNG FAMILIES           Budget 17763
##  5:  OLDER SINGLES/COUPLES           Budget 17172
##  6:  OLDER SINGLES/COUPLES       Mainstream 17061
##  7:  OLDER SINGLES/COUPLES          Premium 16560
##  8:               RETIREES           Budget 14225
##  9:         OLDER FAMILIES       Mainstream 13241
## 10:               RETIREES          Premium 12236
## 11:         YOUNG FAMILIES       Mainstream 11947
## 12: MIDAGE SINGLES/COUPLES       Mainstream 11095
## 13:         YOUNG FAMILIES          Premium 10784
## 14:         OLDER FAMILIES          Premium 10403
## 15:  YOUNG SINGLES/COUPLES           Budget  8573
## 16: MIDAGE SINGLES/COUPLES          Premium  7612
## 17:  YOUNG SINGLES/COUPLES          Premium  5852
## 18: MIDAGE SINGLES/COUPLES           Budget  4691
## 19:           NEW FAMILIES           Budget  2824
## 20:           NEW FAMILIES       Mainstream  2185
## 21:           NEW FAMILIES          Premium  1488
##                  LIFESTAGE PREMIUM_CUSTOMER Count
ggplot(lifestage_profile, aes(x = LIFESTAGE, y = Count, fill = PREMIUM_CUSTOMER)) +
  geom_bar(stat = 'identity') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = Count),
            position = position_stack(vjust = 0.5),
            size = 3) +
  labs(title = "Distribution by Lifestage and Spending Profile",
       x = 'Life stage of Customer',
       y = 'Number of customers',
       fill = 'Spending Profile')

# Sales metrics by Life Stage and Spending Profile
segment_metrics <- data[, .(
  avg_spend = mean(TOT_SALES),
  total_sales = sum(TOT_SALES),
  avg_order_size = mean(PROD_QTY)
), by = .(LIFESTAGE, PREMIUM_CUSTOMER)]

Which lifestage and spending profile segment is the largest contributor to sales?

high_value_segments <- segment_metrics[, .(
  LIFESTAGE,
  PREMIUM_CUSTOMER,
  Contribution = round(total_sales / sum(total_sales) * 100, 2))][order(-Contribution)]

ggplot(high_value_segments, aes(x = LIFESTAGE, 
                                y = Contribution, 
                                fill = PREMIUM_CUSTOMER)) + 
  geom_bar(stat = 'identity') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_text(aes(label = Contribution),
            position = position_stack(vjust = 0.5)) +
  labs(title = '% Contribution across Lifestage and Spending Profile', 
       fill = 'Spending Profile')

OLDER SINGLES/COUPLES contribute the most () to sales as a Life stage segment
OLDER FAMILIES - BUDGET contribute the most as a Life stage/Spending Profile segment
NEW FAMILIES as a segment are the smallest contributor to sales, contributing less than 3% of sales.