#Load Required Libraries
#### Load required libraries
library(data.table)
library(ggplot2)
library(ggmosaic)
library(readr)
##Load datasets
#Load the data
?getDTthreads
## starting httpd help server ... done
transactionData <- read.csv("C:/Users/ebene/Documents/QVI_transaction_data.csv")
customerData <- read.csv("C:/Users/ebene/Documents/QVI_purchase_behaviour.csv")
##Exploratory data analysis The first step in any analysis is to first understand the data.
head(transactionData)
## DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
## 1 43390 1 1000 1 5
## 2 43599 1 1307 348 66
## 3 43605 1 1343 383 61
## 4 43329 2 2373 974 69
## 5 43330 2 2426 1038 108
## 6 43604 4 4074 2982 57
## PROD_NAME PROD_QTY TOT_SALES
## 1 Natural Chip Compny SeaSalt175g 2 6.0
## 2 CCs Nacho Cheese 175g 3 6.3
## 3 Smiths Crinkle Cut Chips Chicken 170g 2 2.9
## 4 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0
## 5 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8
## 6 Old El Paso Salsa Dip Tomato Mild 300g 1 5.1
str(transactionData)
## 'data.frame': 264836 obs. of 8 variables:
## $ DATE : int 43390 43599 43605 43329 43330 43604 43601 43601 43332 43330 ...
## $ STORE_NBR : int 1 1 1 2 2 4 4 4 5 7 ...
## $ LYLTY_CARD_NBR: int 1000 1307 1343 2373 2426 4074 4149 4196 5026 7150 ...
## $ TXN_ID : int 1 348 383 974 1038 2982 3333 3539 4525 6900 ...
## $ PROD_NBR : int 5 66 61 69 108 57 16 24 42 52 ...
## $ PROD_NAME : chr "Natural Chip Compny SeaSalt175g" "CCs Nacho Cheese 175g"
"Smiths Crinkle Cut Chips Chicken 170g" "Smiths Chip Thinly S/Cream&Onion 175g"
...
## $ PROD_QTY : int 2 3 2 5 3 1 1 1 1 2 ...
## $ TOT_SALES : num 6 6.3 2.9 15 13.8 5.1 5.7 3.6 3.9 7.2 ...
##Convert DATE column to a date format
#### A quick search online tells us that CSV and Excel integer dates begin on 30
#Dec 1899
transactionData$DATE <- as.Date(transactionData$DATE, origin = "1899-12-30")
#Examine PROD_NAME
setDT(transactionData)
transactionData[, .N , PROD_NAME]
## PROD_NAME N
## 1: Natural Chip Compny SeaSalt175g 1468
## 2: CCs Nacho Cheese 175g 1498
## 3: Smiths Crinkle Cut Chips Chicken 170g 1484
## 4: Smiths Chip Thinly S/Cream&Onion 175g 1473
## 5: Kettle Tortilla ChpsHny&Jlpno Chili 150g 3296
## ---
## 110: Red Rock Deli Chikn&Garlic Aioli 150g 1434
## 111: RRD SR Slow Rst Pork Belly 150g 1526
## 112: RRD Pc Sea Salt 165g 1431
## 113: Smith Crinkle Cut Bolognese 150g 1451
## 114: Doritos Salsa Mild 300g 1472
##Text Analysis
prodWords <- data.table(unlist(strsplit(unique(transactionData[, PROD_NAME]), " ")))
setnames(prodWords, 'words')
##Remove all words with digits and special characters
prodWords <- prodWords[grepl("\\d", words) == FALSE, ] #remove special digits
prodWords <- prodWords[grepl("[:alpha:]" , words), ] #remove special characters
##count frequency of words
prodWords[, .N, words][order(N, decreasing = TRUE)]
## words N
## 1: Chips 21
## 2: Smiths 16
## 3: Crinkle 14
## 4: Kettle 13
## 5: Cheese 12
## ---
## 127: Chikn&Garlic 1
## 128: Aioli 1
## 129: Slow 1
## 130: Belly 1
## 131: Bolognese 1
##As we are only interested in chips, so we will remove salsa products.
transactionData[, SALSA := grepl("salsa", tolower(PROD_NAME))]
transactionData<- transactionData[SALSA == FALSE, ][, SALSA := NULL]
##Summary of Data to Check Nulls and Outliers
summary(transactionData)
## DATE STORE_NBR LYLTY_CARD_NBR TXN_ID
## Min. :2018-07-01 Min. : 1.0 Min. : 1000 Min. : 1
## 1st Qu.:2018-09-30 1st Qu.: 70.0 1st Qu.: 70015 1st Qu.: 67569
## Median :2018-12-30 Median :130.0 Median : 130367 Median : 135183
## Mean :2018-12-30 Mean :135.1 Mean : 135531 Mean : 135131
## 3rd Qu.:2019-03-31 3rd Qu.:203.0 3rd Qu.: 203084 3rd Qu.: 202654
## Max. :2019-06-30 Max. :272.0 Max. :2373711 Max. :2415841
## PROD_NBR PROD_NAME PROD_QTY TOT_SALES
## Min. : 1.00 Length:246742 Min. : 1.000 Min. : 1.700
## 1st Qu.: 26.00 Class :character 1st Qu.: 2.000 1st Qu.: 5.800
## Median : 53.00 Mode :character Median : 2.000 Median : 7.400
## Mean : 56.35 Mean : 1.908 Mean : 7.321
## 3rd Qu.: 87.00 3rd Qu.: 2.000 3rd Qu.: 8.800
## Max. :114.00 Max. :200.000 Max. :650.000
transactionData[PROD_QTY == 200, ]
## DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
## 1: 2018-08-19 226 226000 226201 4
## 2: 2019-05-20 226 226000 226210 4
## PROD_NAME PROD_QTY TOT_SALES
## 1: Dorito Corn Chp Supreme 380g 200 650
## 2: Dorito Corn Chp Supreme 380g 200 650
transactionData[LYLTY_CARD_NBR == 226000, ]
## DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR
## 1: 2018-08-19 226 226000 226201 4
## 2: 2019-05-20 226 226000 226210 4
## PROD_NAME PROD_QTY TOT_SALES
## 1: Dorito Corn Chp Supreme 380g 200 650
## 2: Dorito Corn Chp Supreme 380g 200 650
Filter out the customer based on loyalty card number
transactionData <- transactionData[LYLTY_CARD_NBR != 226000, ]
#Summary of data
summary(transactionData)
## DATE STORE_NBR LYLTY_CARD_NBR TXN_ID
## Min. :2018-07-01 Min. : 1.0 Min. : 1000 Min. : 1
## 1st Qu.:2018-09-30 1st Qu.: 70.0 1st Qu.: 70015 1st Qu.: 67569
## Median :2018-12-30 Median :130.0 Median : 130367 Median : 135182
## Mean :2018-12-30 Mean :135.1 Mean : 135530 Mean : 135130
## 3rd Qu.:2019-03-31 3rd Qu.:203.0 3rd Qu.: 203083 3rd Qu.: 202652
## Max. :2019-06-30 Max. :272.0 Max. :2373711 Max. :2415841
## PROD_NBR PROD_NAME PROD_QTY TOT_SALES
## Min. : 1.00 Length:246740 Min. :1.000 Min. : 1.700
## 1st Qu.: 26.00 Class :character 1st Qu.:2.000 1st Qu.: 5.800
## Median : 53.00 Mode :character Median :2.000 Median : 7.400
## Mean : 56.35 Mean :1.906 Mean : 7.316
## 3rd Qu.: 87.00 3rd Qu.:2.000 3rd Qu.: 8.800
## Max. :114.00 Max. :5.000 Max. :29.500
##number of transaction by date
#number of transaction by date
transactionData[, .N, by= DATE]
## DATE N
## 1: 2018-10-17 682
## 2: 2019-05-14 705
## 3: 2019-05-20 707
## 4: 2018-08-17 663
## 5: 2018-08-18 683
## ---
## 360: 2018-12-08 622
## 361: 2019-01-30 689
## 362: 2019-02-09 671
## 363: 2018-08-31 658
## 364: 2019-02-12 684
##create a sequence of dates
eachdate <- data.table(seq(as.Date("2018/07/01"), as.Date("2019/06/30"), by = "day"))
setnames(eachdate, "DATE")
byday_trans <- merge(eachdate, transactionData[, .N, by = DATE], all.x = TRUE)
#SETTING PLOT THEMES TO FORMAT GRAPHS
#SETTING PLOT THEMES TO FORMAT GRAPHS
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))
# over time transactions plot
ggplot(byday_trans, aes(x= DATE, y = N)) + geom_line() +
labs(x = "DAY", y = "Number of Transactions", title = "Transactions over Time") +
scale_x_date(breaks = "1 month") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
#narrow down number of days
ggplot(byday_trans[month(DATE) == 12, ], aes(x = DATE, y = N)) +
geom_line() +
labs(x = "Day", y = "Number of transactions", title = "Transactions over December") +
scale_x_date(breaks = "1 day") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
##This graph shows that higher sales were in December, there was a
linear upward trend from 15th to 19th, and from 21st to 23rd. Christmas
is responsiable for this trand. 25th, 2018 is missing
##pack size of Chips
#pack size
transactionData[, PACK_SIZE := parse_number(PROD_NAME)]
transactionData[, .N, PACK_SIZE][order(PACK_SIZE)]
## PACK_SIZE N
## 1: 70 1507
## 2: 90 3008
## 3: 110 22387
## 4: 125 1454
## 5: 134 25102
## 6: 135 3257
## 7: 150 40203
## 8: 160 2970
## 9: 165 15297
## 10: 170 19983
## 11: 175 66390
## 12: 180 1468
## 13: 190 2995
## 14: 200 4473
## 15: 210 6272
## 16: 220 1564
## 17: 250 3169
## 18: 270 6285
## 19: 330 12540
## 20: 380 6416
##pack size of Chips
#pack size
transactionData[, PACK_SIZE := parse_number(PROD_NAME)]
transactionData[, .N, PACK_SIZE][order(PACK_SIZE)]
## PACK_SIZE N
## 1: 70 1507
## 2: 90 3008
## 3: 110 22387
## 4: 125 1454
## 5: 134 25102
## 6: 135 3257
## 7: 150 40203
## 8: 160 2970
## 9: 165 15297
## 10: 170 19983
## 11: 175 66390
## 12: 180 1468
## 13: 190 2995
## 14: 200 4473
## 15: 210 6272
## 16: 220 1564
## 17: 250 3169
## 18: 270 6285
## 19: 330 12540
## 20: 380 6416
##plot histogram
hist (transactionData$PACK_SIZE)
#create BRANDS using first word in PROD_NAME
transactionData[, BRAND := toupper(substr(PROD_NAME, 1, regexpr(pattern = ' ', PROD_NAME) -1))]
transactionData[, .N, by = BRAND][order(-N)]
## BRAND N
## 1: KETTLE 41288
## 2: SMITHS 27390
## 3: PRINGLES 25102
## 4: DORITOS 22041
## 5: THINS 14075
## 6: RRD 11894
## 7: INFUZIONS 11057
## 8: WW 10320
## 9: COBS 9693
## 10: TOSTITOS 9471
## 11: TWISTIES 9454
## 12: TYRRELLS 6442
## 13: GRAIN 6272
## 14: NATURAL 6050
## 15: CHEEZELS 4603
## 16: CCS 4551
## 17: RED 4427
## 18: DORITO 3183
## 19: INFZNS 3144
## 20: SMITH 2963
## 21: CHEETOS 2927
## 22: SNBTS 1576
## 23: BURGER 1564
## 24: WOOLWORTHS 1516
## 25: GRNWVES 1468
## 26: SUNBITES 1432
## 27: NCC 1419
## 28: FRENCH 1418
## BRAND N
#combine same brand names
transactionData[BRAND == "RED", BRAND := "RRD"]
transactionData[BRAND == "SNBTS", BRAND := "SUNBITES"]
transactionData[BRAND == "INFZNS", BRAND := "INFUZIONS"]
transactionData[BRAND == "WW", BRAND := "WOOLWORTHS"]
transactionData[BRAND == "SMITH", BRAND := "SMITHS"]
transactionData[BRAND == "NCC", BRAND := "NATURAL"]
transactionData[BRAND == "DORITO", BRAND := "DORITOS"]
transactionData[BRAND == "GRAIN", BRAND := "GRNWVES"]
transactionData[, .N, by = BRAND][order(N)]
## BRAND N
## 1: FRENCH 1418
## 2: BURGER 1564
## 3: CHEETOS 2927
## 4: SUNBITES 3008
## 5: CCS 4551
## 6: CHEEZELS 4603
## 7: TYRRELLS 6442
## 8: NATURAL 7469
## 9: GRNWVES 7740
## 10: TWISTIES 9454
## 11: TOSTITOS 9471
## 12: COBS 9693
## 13: WOOLWORTHS 11836
## 14: THINS 14075
## 15: INFUZIONS 14201
## 16: RRD 16321
## 17: PRINGLES 25102
## 18: DORITOS 25224
## 19: SMITHS 30353
## 20: KETTLE 41288
Examining Customer Data
str(customerData)
## 'data.frame': 72637 obs. of 3 variables:
## $ LYLTY_CARD_NBR : int 1000 1002 1003 1004 1005 1007 1009 1010 1011 1012 ...
## $ LIFESTAGE : chr "YOUNG SINGLES/COUPLES" "YOUNG SINGLES/COUPLES" "YOUNG
FAMILIES" "OLDER SINGLES/COUPLES" ...
## $ PREMIUM_CUSTOMER: chr "Premium" "Mainstream" "Budget" "Mainstream" ...
summary(customerData)
## LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
## Min. : 1000 Length:72637 Length:72637
## 1st Qu.: 66202 Class :character Class :character
## Median : 134040 Mode :character Mode :character
## Mean : 136186
## 3rd Qu.: 203375
## Max. :2373711
###Examine Values of Lifestage & Premium Customer
setDT(customerData)
customerData[,.N, by= LIFESTAGE][order(-N)]
## LIFESTAGE N
## 1: RETIREES 14805
## 2: OLDER SINGLES/COUPLES 14609
## 3: YOUNG SINGLES/COUPLES 14441
## 4: OLDER FAMILIES 9780
## 5: YOUNG FAMILIES 9178
## 6: MIDAGE SINGLES/COUPLES 7275
## 7: NEW FAMILIES 2549
customerData[, .N, by= PREMIUM_CUSTOMER][order(-N)]
## PREMIUM_CUSTOMER N
## 1: Mainstream 29245
## 2: Budget 24470
## 3: Premium 18922
##Merge transdata and purchasebehaviour data
mergdata <- merge(transactionData, customerData, all.x = TRUE)
#check if some customers were not matched on by checking for nulls.
mergdata[is.null(LIFESTAGE), .N]
## [1] 0
mergdata[is.null(PREMIUM_CUSTOMER), .N]
## [1] 0
##BRAVO! DATA EXPLORATION IS COMPLETED. ##Data Analysis on Customer Segmentation
##Total sales by LIFESTAGE & PREMIUM_CUSTOMERS
setDT(mergdata)
sales <- mergdata[, .(SALES = sum(TOT_SALES)), .(LIFESTAGE, PREMIUM_CUSTOMER)]
##Create a plot
plot <- ggplot(data = sales) +
geom_mosaic(aes(weight = SALES, x= product(PREMIUM_CUSTOMER, LIFESTAGE), fill = PREMIUM_CUSTOMER)) +
labs(x= "Lifestage", y= "Premium Customer Flag", title = "Proportion of Sales") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
#Plot and Lable with Proportion of Sales
#Plot and Lable with Proportion of Sales
plot + geom_text(data = ggplot_build(plot)$data[[1]], aes(x = (xmin + xmax)/2 , y =
(ymin + ymax)/2, label = as.character(paste(round(.wt/sum(.wt),3)*100,
'%'))))
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `unite()` instead.
## ℹ The deprecated feature was likely used in the ggmosaic package.
## Please report the issue at <]8;;https://github.com/haleyjeppson/ggmosaichttps://github.com/haleyjeppson/ggmosaic]8;;>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Let’s see if the higher sales are due to more customers who buy chips.
##Number of customers by LIFESTAGE and PREMIUM_CUSTOMER
customers <- mergdata[, .(CUSTOMERS = uniqueN(LYLTY_CARD_NBR)), .(LIFESTAGE, PREMIUM_CUSTOMER)][order(-CUSTOMERS)]
#create a plot
plot_2 <- ggplot(data = customers) +
geom_mosaic(aes(weight = CUSTOMERS, x = product(PREMIUM_CUSTOMER, LIFESTAGE), fill = PREMIUM_CUSTOMER)) +
labs(x = "Lifestage", y = "Premium customer flag", title = "Proportion of customers") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
##plot and label with proportion of customers
plot_2 + geom_text(data = ggplot_build(plot_2)$data[[1]], aes(x = (xmin + xmax)/2 , y =
(ymin + ymax)/2, label = as.character(paste(round(.wt/sum(.wt),3)*100,
'%'))))
quantity_segment1_by_pack <- segment1[, .(targetSegment = sum(PROD_QTY)/quantity_segment1), by = PACK_SIZE]
quantity_other_by_pack <- other[, .(other = sum(PROD_QTY)/quantity_other), by = PACK_SIZE]
pack_proportions <- merge(quantity_segment1_by_pack, quantity_other_by_pack)[, affinityToPack := targetSegment/other]
pack_proportions[order(-affinityToPack)]
## PACK_SIZE targetSegment other affinityToPack
## 1: 270 0.031828847 0.025095929 1.2682873
## 2: 380 0.032160110 0.025584213 1.2570295
## 3: 330 0.061283644 0.050161917 1.2217166
## 4: 134 0.119420290 0.100634769 1.1866703
## 5: 110 0.106280193 0.089791190 1.1836372
## 6: 210 0.029123533 0.025121265 1.1593180
## 7: 135 0.014768806 0.013075403 1.1295106
## 8: 250 0.014354727 0.012780590 1.1231662
## 9: 170 0.080772947 0.080985964 0.9973697
## 10: 150 0.157598344 0.163420656 0.9643722
## 11: 175 0.254989648 0.270006956 0.9443818
## 12: 165 0.055652174 0.062267662 0.8937572
## 13: 190 0.007481021 0.012442016 0.6012708
## 14: 180 0.003588682 0.006066692 0.5915385
## 15: 160 0.006404417 0.012372920 0.5176157
## 16: 90 0.006349206 0.012580210 0.5046980
## 17: 125 0.003008972 0.006036750 0.4984423
## 18: 200 0.008971705 0.018656115 0.4808989
## 19: 70 0.003036577 0.006322350 0.4802924
## 20: 220 0.002926156 0.006596434 0.4435967
##Mainstream young singles/couples are 27% more likely to purchase a 270g pack of ##chips compared to the rest of the population
mergdata[PACK_SIZE == 270, unique(PROD_NAME)]
## [1] "Twisties Cheese 270g" "Twisties Chicken270g"
##INSIGHTS Twisties are the only brand offering 270g packs and this could be reflecting a higher likelihood of purchasing Twisties, because there is no other chioce of that size
Sales have mainly been due to Budget - older families, Mainstream - young singles/couples, and Mainstream - retirees shoppers. We found that the high spend in chips for mainstream young singles/couples and retirees is due to there being more of them than other buyers.
Mainstream, midage and young singles and couples are also more likely to pay more per packet of chips. This is indicative of impulse buying behaviour. We’ve also found that Mainstream young singles and couples are 23% more likely to purchase Tyrrells chips compared to the rest of the population. The Category Manager may want to increase the category’s performance by off-locating some Tyrrells and smaller packs of chips in discretionary space near segments where young singles and couples frequent more often to increase visibilty and impulse behaviour.