# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
#load package
library(gridExtra)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 1.0.0 ✔ dplyr 0.8.3
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ tibble 2.1.3 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gridExtra)
library(janitor) # for tyble
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lmPerm) # for ANOVA
library(formattable)# For table formatting and table formatting functions
library(htmltools)
#Data Loading
my_data <- read.csv("mtp_data.csv")
# Set Factor
my_data$promo <- as.factor(my_data$promo)
my_data$iri_key <- as.factor(my_data$iri_key)
my_data <- my_data %>% mutate(sales = price*units)
# Create Sales Column
my_data <- my_data %>% mutate(sales = price*units)
#create producer column
GM_brand <- c("GENERAL MILLS CHEERIOS", "GENERAL MILLS CINNAMON TST CR","GENERAL MILLS COCOA PUFFS","GENERAL MILLS KIX","GENERAL MILLS LUCKY CHARMS")
K_brand <- c("KELLOGGS COCOA KRISPIES","KELLOGGS FROOT LOOPS","KELLOGGS FROSTED FLAKES","KELLOGGS FROSTED MINI WHEATS","KELLOGGS RAISIN BRAN","KELLOGGS RICE KRISPIES","KELLOGGS SMART START","KELLOGGS SPECIAL K")
P_brand <- c("POST GRAPE NUTS","POST SHREDDED WHEAT")
my_data <- my_data %>% mutate(producer = as.factor(ifelse(brand %in% GM_brand , "GM",
ifelse(brand %in% K_brand, "KELLOGGS","POST" ))))
GM_record <- my_data %>% filter(producer == "GM")
K_record <- my_data %>% filter(producer =="KELLOGGS")
P_record <- my_data %>% filter(producer == "POST")
#view data
head(my_data)
## UPC iri_key week units brand
## 1 00-01-16000-11653 644347 1484 5 GENERAL MILLS CINNAMON TST CR
## 2 00-01-16000-11653 248741 1483 2 GENERAL MILLS CINNAMON TST CR
## 3 00-01-16000-11653 535806 1489 3 GENERAL MILLS CINNAMON TST CR
## 4 00-01-16000-11945 675634 1489 2 GENERAL MILLS CHEERIOS
## 5 00-01-16000-11945 205272 1491 8 GENERAL MILLS CHEERIOS
## 6 00-01-16000-11945 248741 1492 5 GENERAL MILLS CHEERIOS
## flavor package volume price promo ad sales producer
## 1 CINNAMON TOAST BOX 0.06 0.5 0 A 2.5 GM
## 2 CINNAMON TOAST BOX 0.06 0.5 0 NONE 1.0 GM
## 3 CINNAMON TOAST BOX 0.06 0.5 0 NONE 1.5 GM
## 4 TOASTED BOX 0.04 0.5 0 NONE 1.0 GM
## 5 TOASTED BOX 0.04 0.5 0 NONE 4.0 GM
## 6 TOASTED BOX 0.04 0.5 0 NONE 2.5 GM
summary(my_data)
print(paste("Number of UPC:",nrow(my_data %>% group_by(UPC)%>%summarise(n()))))
print(paste("Number of Store :",nrow(my_data %>% group_by(iri_key)%>%summarise(n()))))
print(paste("Number of brand : " ,nrow(my_data %>% group_by(brand)%>%summarise(n()))))
Attributes Analysis
prodcut attributes :
UPC : unique product number (114 kinds)
producer : (3 types)
brand : (15 kinds)
flavor : (5 kinds)
package : (2 types)
Senerio :
week: transit time (52 weeks)
iri_key: store number (1420 locations)
Strategy:
promo : (0/1)
ad: none / A / B (3 types)
Outcome :
Unit: certain UPC that a store sells in unit per week
sales; unit * price
Analysis of Whole Cereal Market
price_plot <-
ggplot(my_data,aes(x = price)) +
geom_histogram(binwidth = 0.2) +
labs(title="Price Distribution")
sale_trend <-
my_data %>%
group_by(week)%>%
summarise(summy = n() ) %>%
ggplot(aes(x = week, y = summy)) +
geom_line(stat = "identity")+
labs(title = "Sales Trend")
unit_plot <-
ggplot(my_data,aes(x = units)) +
geom_bar() +
labs(title="Units Distribution")
producer_plot <-
ggplot(my_data, aes(x = producer, y = sum(sales)))+
geom_bar(stat = "identity")+
scale_y_continuous(labels = scales::dollar)+
labs(title = "Accumulated Sales by producer", y = "sales")
flavor_plot <-
my_data %>%
group_by(flavor)%>%
summarise(countty = n()) %>%
ggplot(aes(x = reorder(flavor, -countty), y = countty))+
geom_bar(stat = "identity") +
scale_y_continuous(labels = scales::dollar)+
labs(title="Accumulated Sales by flavor", y = "sales")
volumn_plot<-
ggplot(my_data,aes(x = volume))+
geom_freqpoly() +
scale_y_continuous(labels = scales::dollar)+
labs(title="Accumulated Sales by volume", y="sales")
price_plot
sale_trend
unit_plot
producer_plot
flavor_plot
volumn_plot
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Findings:
Price distribution is right-skewed.
The total sales record is punctuated without a significant pattern.
Per store per UPC sold generally less than 6 units.
Kelloggs sells much better than us for total sales, but we are better than POST.
Customers are looking for regular and toasted cereal.
Customers are looking for a volume between 0.5-1.5.
my_data %>% tabyl(brand,flavor)
## brand CINNAMON TOAST COCOA FRUIT REGULAR TOASTED
## GENERAL MILLS CHEERIOS 0 0 0 4 1454
## GENERAL MILLS CINNAMON TST CR 1834 0 0 0 0
## GENERAL MILLS COCOA PUFFS 0 1020 0 0 0
## GENERAL MILLS KIX 0 0 0 1196 0
## GENERAL MILLS LUCKY CHARMS 0 0 0 3 1678
## KELLOGGS COCOA KRISPIES 0 881 0 0 0
## KELLOGGS FROOT LOOPS 0 0 2192 0 0
## KELLOGGS FROSTED FLAKES 0 0 0 2295 0
## KELLOGGS FROSTED MINI WHEATS 0 0 0 1574 0
## KELLOGGS RAISIN BRAN 0 0 0 1266 0
## KELLOGGS RICE KRISPIES 0 0 0 0 1450
## KELLOGGS SMART START 0 0 0 0 1134
## KELLOGGS SPECIAL K 0 0 0 0 1391
## POST GRAPE NUTS 0 0 0 1289 0
## POST SHREDDED WHEAT 0 0 0 1189 0
Assumption:
grid.arrange(
# market share by sales
my_data
%>%group_by(brand,flavor,producer)
%>% summarise(summy_sales = sum(sales))
%>% ggplot(aes(x = flavor, y = summy_sales, fill = brand))
+ facet_grid(producer~.)
+ geom_bar(stat = "identity", position = "dodge")
+ labs(title = "Brand sale with Flavor by Producer"),
ncol =1
)
Finding: 1. On flavor Regular, we lost a lot of sales. we only have one brand, comparing to Lelloggs who has three. This may be one of the reasons we are losing sales.
grid.arrange(
# market share by sales
my_data
%>%group_by(producer,flavor)
%>% summarise(summy_sales = sum(sales))
%>% ggplot(aes(x = producer, y = summy_sales, fill = flavor))
+ geom_bar(stat = "identity", position = "dodge")
+ labs(title = "Market Share with Flavor by Sales"),
# market share by units
my_data
%>%group_by(producer,flavor)
%>% summarise(summy_units = sum(units))
%>% ggplot(aes(x = producer, y = summy_units, fill = flavor))
+ geom_bar(stat = "identity", position = "dodge")
+ labs(title = "Market Share with Flavo by unit"),
ncol =1
)
Finding : 1. We win in flavor cinnamon, cocoa and toasted. 2. we lost in regular to both competitive brands. The difference between us and POST is not obvious, so we will do a t.test to double-check
Test for the above finidng
# GM with Kelloggs
t.test(my_data$sales[my_data$producer == "GM" & my_data$flavor == "REGULAR"],my_data$sales[my_data$producer == "KELLOGGS" & my_data$flavor == "REGULAR"])
##
## Welch Two Sample t-test
##
## data: my_data$sales[my_data$producer == "GM" & my_data$flavor == "REGULAR"] and my_data$sales[my_data$producer == "KELLOGGS" & my_data$flavor == my_data$sales[my_data$producer == "GM" & my_data$flavor == "REGULAR"] and "REGULAR"]
## t = -10.657, df = 1951.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -9.262868 -6.383443
## sample estimates:
## mean of x mean of y
## 26.98179 34.80494
# GM with POST
t.test(my_data$sales[my_data$producer == "GM" & my_data$flavor == "REGULAR"],my_data$sales[my_data$producer == "POST" & my_data$flavor == "REGULAR"])
##
## Welch Two Sample t-test
##
## data: my_data$sales[my_data$producer == "GM" & my_data$flavor == "REGULAR"] and my_data$sales[my_data$producer == "POST" & my_data$flavor == my_data$sales[my_data$producer == "GM" & my_data$flavor == "REGULAR"] and "REGULAR"]
## t = 5.6018, df = 2113.1, p-value = 2.398e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.757104 5.727343
## sample estimates:
## mean of x mean of y
## 26.98179 22.73956
Test Result :
P_vaule with Kelloggs : < 2.2e-16
P_vaule with POST : 2.398e-08
We lost sales in regular to both competitors.
grid.arrange(
# by flavor
my_data
%>%group_by(flavor,package)
%>% summarise(sum_sales = sum(sales))
%>% ggplot(aes(x = flavor, y = sum_sales, fill = package))
+ geom_bar(stat = "identity", position = "dodge")
+ labs(title = "market share by package"),
# by producer
my_data
%>%group_by(producer,package)
%>% summarise(sum_units = sum(units))
%>% ggplot(aes(x = producer, y = sum_units, fill = package))
+ geom_bar(stat = "identity", position = "dodge")
+ labs(title = "market share by package"),
ncol =1
)
Finding :
grid.arrange(
my_data %>%
group_by(flavor, producer) %>%
summarise( unit_prcie = median( price/volume)) %>%
ggplot(aes(x = producer, y = unit_prcie, fill = flavor)) +
geom_bar(stat = "identity",position = "dodge"),
my_data %>%
group_by(brand,producer,flavor) %>%
mutate(unit_prcie = (price/volume)) %>%
ggplot(aes(x = producer, y = unit_prcie,fill=flavor))+
geom_boxplot(),
ncol = 1
)
Finding :
For the promotion analysis, we will analyze both sales and units. Sale increases will bring more revenue, and unit increases will increase market share. Both are executive goals.
grid.arrange(
#By sales
GM_record %>%
group_by(brand, promo)%>%
summarise(mean_sales = mean(sales))%>%
ggplot(aes(x = brand, y = mean_sales, fill = promo))+
geom_bar(stat = "identity", position = "dodge")+
coord_flip()+
labs (title = "Promotion Impact on Sales"),
#By unit
GM_record %>%
group_by(brand, promo)%>%
summarise(mean_unit = mean(units))%>%
ggplot(aes(x = brand, y = mean_unit, fill = promo))+
geom_bar(stat = "identity", position = "dodge")+
coord_flip()+
labs (title = "Promotion Impact on unit"),
ncol = 1
)
Finding :
Promotion does increase the units in all brands, for sales as well except Cheerios.
# 1 GENERAL MILLS CHEERIOS - sale decrease / unit increase
z <- qnorm(0.975)
GM_Cherrios <- my_data%>% filter(brand == "GENERAL MILLS CHEERIOS")
# evaluate promotion with T test
t.test(GM_Cherrios$sales[GM_Cherrios$promo == "0"],GM_Cherrios$sales[GM_Cherrios$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_Cherrios$sales[GM_Cherrios$promo == "0"] and GM_Cherrios$sales[GM_Cherrios$promo == "1"]
## t = 4.0241, df = 331.68, p-value = 7.09e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.948273 11.500092
## sample estimates:
## mean of x mean of y
## 52.34634 44.62216
t.test(GM_Cherrios$units[GM_Cherrios$promo == "0"],GM_Cherrios$units[GM_Cherrios$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_Cherrios$units[GM_Cherrios$promo == "0"] and GM_Cherrios$units[GM_Cherrios$promo == "1"]
## t = -2.7163, df = 299.5, p-value = 0.006985
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.5048813 -0.4002089
## sample estimates:
## mean of x mean of y
## 13.79516 15.24771
# Test visulization
GM_Cherrios %>%
group_by(promo) %>%
summarise(m = median(sales),sd = sd(sales),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = promo,y = m,fill = promo)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
# reuslt
CHEERIOS_TOAST <- c("Decrease","Increase")
Test result :
Sale decrease / Unit increase
# 2 GENERAL MILLS CINNAMON TST CR - sale increase / unit increase
GM_CINNAMON <- my_data%>% filter(brand == "GENERAL MILLS CINNAMON TST CR")
# evaluate promotion with T test
t.test(GM_CINNAMON$sales[GM_CINNAMON$promo == "0"],GM_CINNAMON$sales[GM_CINNAMON$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_CINNAMON$sales[GM_CINNAMON$promo == "0"] and GM_CINNAMON$sales[GM_CINNAMON$promo == "1"]
## t = -2.3776, df = 465.66, p-value = 0.01783
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.2480936 -0.5934813
## sample estimates:
## mean of x mean of y
## 33.59999 37.02077
t.test(GM_CINNAMON$units[GM_CINNAMON$promo == "0"],GM_CINNAMON$units[GM_CINNAMON$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_CINNAMON$units[GM_CINNAMON$promo == "0"] and GM_CINNAMON$units[GM_CINNAMON$promo == "1"]
## t = -10.182, df = 381.12, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.226214 -3.534405
## sample estimates:
## mean of x mean of y
## 8.033832 12.414141
# Test visulization
GM_CINNAMON %>%
group_by(promo) %>%
summarise(m = median(sales),sd = sd(sales),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = promo,y = m,fill = promo)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
# reuslt
TST_CINNAMON <- c("Increase","Increase")
Test result :
Sale increase / Unit increase
# 3 GENERAL MILLS COCOA PUFFS - sale increase / unit increase
GM_cocoa <- my_data%>% filter(brand == "GENERAL MILLS COCOA PUFFS")
# evaluate promotion with T test
t.test(GM_cocoa$sales[GM_cocoa$promo == "0"],GM_cocoa$sales[GM_cocoa$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_cocoa$sales[GM_cocoa$promo == "0"] and GM_cocoa$sales[GM_cocoa$promo == "1"]
## t = -4.0746, df = 422.34, p-value = 5.507e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.459441 -2.953708
## sample estimates:
## mean of x mean of y
## 24.33817 30.04474
t.test(GM_cocoa$units[GM_cocoa$promo == "0"],GM_cocoa$units[GM_cocoa$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_cocoa$units[GM_cocoa$promo == "0"] and GM_cocoa$units[GM_cocoa$promo == "1"]
## t = -8.4894, df = 345.28, p-value = 6.234e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.481580 -3.419378
## sample estimates:
## mean of x mean of y
## 6.888166 11.338645
# Test visulization
GM_cocoa %>%
group_by(promo) %>%
summarise(m = median(sales),sd = sd(sales),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = promo,y = m,fill = promo)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
# reuslt
PUFFS_COCOA <- c("Increase","Increase")
Test result :
Sale increase / Unit increase
# 4 GENERAL MILLS KIX - sale no differece !! / unit increase
GM_KIX <- my_data%>% filter(brand == "GENERAL MILLS KIX")
# Evaluate promotion with T test
t.test(GM_KIX$sales[GM_KIX$promo == "0"],GM_KIX$sales[GM_KIX$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_KIX$sales[GM_KIX$promo == "0"] and GM_KIX$sales[GM_KIX$promo == "1"]
## t = -0.34451, df = 335.06, p-value = 0.7307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.471542 2.436775
## sample estimates:
## mean of x mean of y
## 26.89814 27.41553
t.test(GM_KIX$units[GM_KIX$promo == "0"],GM_KIX$units[GM_KIX$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_KIX$units[GM_KIX$promo == "0"] and GM_KIX$units[GM_KIX$promo == "1"]
## t = -5.7239, df = 254.31, p-value = 2.922e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.166095 -2.033207
## sample estimates:
## mean of x mean of y
## 7.076229 10.175879
# Test visulization
GM_KIX %>%
group_by(promo) %>%
summarise(m = median(sales),sd = sd(sales),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = promo,y = m,fill = promo)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
# reuslt
KIX_REGULAR <- c("No Difference ","Increase")
Test result :
sale no differece !! / Unit increase
# 5 GENERAL MILLS LUCKY CHARMS - sale iNcrease / unit increase
GM_LUCKY <- my_data%>% filter(brand == "GENERAL MILLS LUCKY CHARMS")
# Evaluate promotion with T test
t.test(GM_LUCKY$sales[GM_LUCKY$promo == "0"],GM_LUCKY$sales[GM_LUCKY$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_LUCKY$sales[GM_LUCKY$promo == "0"] and GM_LUCKY$sales[GM_LUCKY$promo == "1"]
## t = -2.9748, df = 476.11, p-value = 0.003081
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.561649 -1.545826
## sample estimates:
## mean of x mean of y
## 35.36537 39.91911
t.test(GM_LUCKY$units[GM_LUCKY$promo == "0"],GM_LUCKY$units[GM_LUCKY$promo == "1"])
##
## Welch Two Sample t-test
##
## data: GM_LUCKY$units[GM_LUCKY$promo == "0"] and GM_LUCKY$units[GM_LUCKY$promo == "1"]
## t = -10.78, df = 413.78, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.582192 -3.860398
## sample estimates:
## mean of x mean of y
## 8.234261 12.955556
# Test visulization
GM_LUCKY %>%
group_by(promo) %>%
summarise(m = median(sales),sd = sd(sales),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = promo,y = m,fill = promo)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
# reuslt
LUCKY_TOAST <- c("Increase ","Increase")
Test result :
Sale increase / Unit increase
For the promotion analysis, we will analyze only the unit, since the price should not be different.
#By unit
GM_record %>%
group_by(brand, ad)%>%
summarise(mean_unit = mean(units))%>%
ggplot(aes(x = brand, y = mean_unit, fill = ad))+
geom_bar(stat = "identity", position = "dodge")+
coord_flip()+
labs (title = "Ad Impact on unit")
Finding:
Small and median ad both increased the unit sold. However, median ad didn’t always do much better than small ad.
# 1 GENERAL MILLS CHEERIOS
GM_Cherrios <- my_data%>% filter(brand == "GENERAL MILLS CHEERIOS")
# Check if there are enough of poeple in each group
GM_Cherrios %>% group_by(ad)%>% summarise(n())
## # A tibble: 3 x 2
## ad `n()`
## <fct> <int>
## 1 A 79
## 2 B 44
## 3 NONE 1335
# T test
t.test(GM_Cherrios$units[GM_Cherrios$ad == "NONE"],GM_Cherrios$units[GM_Cherrios$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_Cherrios$units[GM_Cherrios$ad == "NONE"] and GM_Cherrios$units[GM_Cherrios$ad == "A"]
## t = -0.31128, df = 86.065, p-value = 0.7563
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.099007 1.530648
## sample estimates:
## mean of x mean of y
## 13.91835 14.20253
t.test(GM_Cherrios$units[GM_Cherrios$ad == "NONE"],GM_Cherrios$units[GM_Cherrios$ad == "B"])
##
## Welch Two Sample t-test
##
## data: GM_Cherrios$units[GM_Cherrios$ad == "NONE"] and GM_Cherrios$units[GM_Cherrios$ad == "B"]
## t = -2.6607, df = 46.833, p-value = 0.01065
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.5737406 -0.6350098
## sample estimates:
## mean of x mean of y
## 13.91835 16.52273
t.test(GM_Cherrios$units[GM_Cherrios$ad == "B"],GM_Cherrios$units[GM_Cherrios$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_Cherrios$units[GM_Cherrios$ad == "B"] and GM_Cherrios$units[GM_Cherrios$ad == "A"]
## t = 1.7736, df = 105.85, p-value = 0.07901
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.273517 4.913908
## sample estimates:
## mean of x mean of y
## 16.52273 14.20253
# test visulization
GM_Cherrios %>%
group_by(ad) %>%
summarise(m = median(units),sd = sd(units),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = ad,y = m, fill = ad)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
GM_Cherrios_ad <- c("TOAST","No Difference","Increase", "No Difference")
Test result : None - A : no difference / None - B : increase /A -B : no difference
# 2 GENERAL MILLS CINNAMON TST CR
GM_CINNAMON <- my_data%>% filter(brand == "GENERAL MILLS CINNAMON TST CR")
# Check if there are enough of poeple in each group
GM_CINNAMON %>% group_by(ad)%>% summarise(n())
## # A tibble: 3 x 2
## ad `n()`
## <fct> <int>
## 1 A 114
## 2 B 51
## 3 NONE 1669
# T test
t.test(GM_CINNAMON$units[GM_CINNAMON$ad == "NONE"],GM_CINNAMON$units[GM_CINNAMON$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_CINNAMON$units[GM_CINNAMON$ad == "NONE"] and GM_CINNAMON$units[GM_CINNAMON$ad == "A"]
## t = -3.8624, df = 124.53, p-value = 0.0001795
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.002526 -1.290327
## sample estimates:
## mean of x mean of y
## 8.502696 11.149123
t.test(GM_CINNAMON$units[GM_CINNAMON$ad == "NONE"],GM_CINNAMON$units[GM_CINNAMON$ad == "B"])
##
## Welch Two Sample t-test
##
## data: GM_CINNAMON$units[GM_CINNAMON$ad == "NONE"] and GM_CINNAMON$units[GM_CINNAMON$ad == "B"]
## t = -2.6003, df = 52.081, p-value = 0.01209
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.8412422 -0.6239536
## sample estimates:
## mean of x mean of y
## 8.502696 11.235294
t.test(GM_CINNAMON$units[GM_CINNAMON$ad == "B"],GM_CINNAMON$units[GM_CINNAMON$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_CINNAMON$units[GM_CINNAMON$ad == "B"] and GM_CINNAMON$units[GM_CINNAMON$ad == "A"]
## t = 0.069684, df = 92.85, p-value = 0.9446
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.369541 2.541884
## sample estimates:
## mean of x mean of y
## 11.23529 11.14912
# test visulization
GM_CINNAMON%>%
group_by(ad) %>%
summarise(m = median(units),sd = sd(units),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = ad,y = m, fill = ad)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
GM_CINNAMON_ad <- c("CINNAMON","Increase","Increase", "No Difference")
Test result : None - A : increase / None - B : increase /A -B : no difference
# 3 GENERAL MILLS COCOA PUFFS
GM_COCOA <- my_data%>% filter(brand == "GENERAL MILLS COCOA PUFFS")
# Check if there are enough of poeple in each group
GM_COCOA %>% group_by(ad)%>% summarise(n())
## # A tibble: 3 x 2
## ad `n()`
## <fct> <int>
## 1 A 76
## 2 B 54
## 3 NONE 890
# T test
t.test(GM_COCOA$units[GM_COCOA$ad == "NONE"],GM_COCOA$units[GM_COCOA$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_COCOA$units[GM_COCOA$ad == "NONE"] and GM_COCOA$units[GM_COCOA$ad == "A"]
## t = -3.7672, df = 83.723, p-value = 0.0003061
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.169132 -1.597219
## sample estimates:
## mean of x mean of y
## 7.524719 10.907895
t.test(GM_COCOA$units[GM_COCOA$ad == "NONE"],GM_COCOA$units[GM_COCOA$ad == "B"])
##
## Welch Two Sample t-test
##
## data: GM_COCOA$units[GM_COCOA$ad == "NONE"] and GM_COCOA$units[GM_COCOA$ad == "B"]
## t = -3.614, df = 57.162, p-value = 0.0006378
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.062705 -1.739709
## sample estimates:
## mean of x mean of y
## 7.524719 11.425926
t.test(GM_COCOA$units[GM_COCOA$ad == "B"],GM_COCOA$units[GM_COCOA$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_COCOA$units[GM_COCOA$ad == "B"] and GM_COCOA$units[GM_COCOA$ad == "A"]
## t = 0.37729, df = 112.76, p-value = 0.7067
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.202269 3.238331
## sample estimates:
## mean of x mean of y
## 11.42593 10.90789
# test visulization
GM_COCOA %>%
group_by(ad) %>%
summarise(m = median(units),sd = sd(units),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = ad,y = m, fill = ad)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
GM_COCOA_ad <- c("COCOA","Increase","Increase", "No Difference")
Test result : None - A : increase / None - B : increase /A -B : no difference
# 4 KIX
GM_KIX <- my_data%>% filter(brand == "GENERAL MILLS KIX")
# Check if there are enough of poeple in each group
GM_KIX %>% group_by(ad)%>% summarise(n())
## # A tibble: 3 x 2
## ad `n()`
## <fct> <int>
## 1 A 70
## 2 B 54
## 3 NONE 1072
# T test
t.test(GM_KIX$units[GM_KIX$ad == "NONE"],GM_KIX$units[GM_KIX$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_KIX$units[GM_KIX$ad == "NONE"] and GM_KIX$units[GM_KIX$ad == "A"]
## t = -2.8929, df = 75.017, p-value = 0.004994
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.4411898 -0.8189381
## sample estimates:
## mean of x mean of y
## 7.298507 9.928571
t.test(GM_KIX$units[GM_KIX$ad == "NONE"],GM_KIX$units[GM_KIX$ad == "B"])
##
## Welch Two Sample t-test
##
## data: GM_KIX$units[GM_KIX$ad == "NONE"] and GM_KIX$units[GM_KIX$ad == "B"]
## t = -3.0389, df = 56.648, p-value = 0.003589
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.127052 -1.053711
## sample estimates:
## mean of x mean of y
## 7.298507 10.388889
t.test(GM_KIX$units[GM_KIX$ad == "B"],GM_KIX$units[GM_KIX$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_KIX$units[GM_KIX$ad == "B"] and GM_KIX$units[GM_KIX$ad == "A"]
## t = 0.34377, df = 114.87, p-value = 0.7316
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.192052 3.112687
## sample estimates:
## mean of x mean of y
## 10.388889 9.928571
# test visulization
GM_KIX %>%
group_by(ad) %>%
summarise(m = median(units),sd = sd(units),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = ad,y = m, fill = ad)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
GM_KIX_ad <- c("REGULAR","Increase","Increase", "No Difference")
Test result : None - A : increase / None - B : increase /A -B : no difference
# 5 GENERAL MILLS CHEERIOS
GM_LUCKY <- my_data %>% filter(brand == "GENERAL MILLS LUCKY CHARMS")
# Check if there are enough of poeple in each group
GM_LUCKY %>% group_by(ad)%>% summarise(n())
## # A tibble: 3 x 2
## ad `n()`
## <fct> <int>
## 1 A 103
## 2 B 69
## 3 NONE 1509
# T test
t.test(GM_LUCKY$units[GM_LUCKY$ad == "NONE"],GM_LUCKY$units[GM_LUCKY$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_LUCKY$units[GM_LUCKY$ad == "NONE"] and GM_LUCKY$units[GM_LUCKY$ad == "A"]
## t = -5.3748, df = 112.61, p-value = 4.192e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.370615 -2.477610
## sample estimates:
## mean of x mean of y
## 8.784626 12.708738
t.test(GM_LUCKY$units[GM_LUCKY$ad == "NONE"],GM_LUCKY$units[GM_LUCKY$ad == "B"])
##
## Welch Two Sample t-test
##
## data: GM_LUCKY$units[GM_LUCKY$ad == "NONE"] and GM_LUCKY$units[GM_LUCKY$ad == "B"]
## t = -2.662, df = 73.003, p-value = 0.009547
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.0006751 -0.5750012
## sample estimates:
## mean of x mean of y
## 8.784626 11.072464
t.test(GM_LUCKY$units[GM_LUCKY$ad == "B"],GM_LUCKY$units[GM_LUCKY$ad == "A"])
##
## Welch Two Sample t-test
##
## data: GM_LUCKY$units[GM_LUCKY$ad == "B"] and GM_LUCKY$units[GM_LUCKY$ad == "A"]
## t = -1.4814, df = 148.93, p-value = 0.1406
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.8189469 0.5463988
## sample estimates:
## mean of x mean of y
## 11.07246 12.70874
# test visulization
GM_LUCKY %>%
group_by(ad) %>%
summarise(m = median(units),sd = sd(units),n = n(), ci = z*sd/sqrt(n))%>% ggplot(aes(x = ad,y = m, fill = ad)) +geom_bar(stat = "identity",position = "dodge") + geom_errorbar(aes(ymin = m - ci, ymax = m + ci),width = 0.5)
GM_LUCKY_ad <- c("TOAST","Increase","Increase", "No Difference")
Test result : None - A : increase / None - B : increase /A -B : no difference
GM_brand <- c(" ", "GENERAL MILLS CHEERIOS", "GENERAL MILLS LUCKY CHARMS","GENERAL MILLS CINNAMON TST CR","GENERAL MILLS COCOA PUFFS","GENERAL MILLS KIX")
GM_promotion <- c("Sales","Units")
promo_eva <- data.frame(GM_promotion, CHEERIOS_TOAST, TST_CINNAMON,PUFFS_COCOA,KIX_REGULAR,LUCKY_TOAST)
pic_table1 <- formattable(promo_eva)
pic_table1
| GM_promotion | CHEERIOS_TOAST | TST_CINNAMON | PUFFS_COCOA | KIX_REGULAR | LUCKY_TOAST |
|---|---|---|---|---|---|
| Sales | Decrease | Increase | Increase | No Difference | Increase |
| Units | Increase | Increase | Increase | Increase | Increase |
png("pic_table1.png",width =800, height=200,bg = "white")
grid.table(pic_table1)
dev.off()
## quartz_off_screen
## 2
#export_formattable(pic_table1,"pic_table1.png")
GM_brand <- c(" ", "GENERAL MILLS CHEERIOS", "GENERAL MILLS LUCKY CHARMS","GENERAL MILLS CINNAMON TST CR","GENERAL MILLS COCOA PUFFS","GENERAL MILLS KIX")
GM_Advertisement <- c("Flavor","Medium Ad","Small Ad"," Medium/Small")
ad_eva <- data.frame(GM_Advertisement ,GM_Cherrios_ad,GM_CINNAMON_ad,GM_COCOA_ad,GM_KIX_ad,GM_LUCKY_ad)
pic_table2 <- formattable(ad_eva)
pic_table2
| GM_Advertisement | GM_Cherrios_ad | GM_CINNAMON_ad | GM_COCOA_ad | GM_KIX_ad | GM_LUCKY_ad |
|---|---|---|---|---|---|
| Flavor | TOAST | CINNAMON | COCOA | REGULAR | TOAST |
| Medium Ad | No Difference | Increase | Increase | Increase | Increase |
| Small Ad | Increase | Increase | Increase | Increase | Increase |
| Medium/Small | No Difference | No Difference | No Difference | No Difference | No Difference |
png("pic_table2.png",width =800, height=200,bg = "white")
grid.table(pic_table2)
dev.off()
## quartz_off_screen
## 2
Conclusion :
# Kick out 7 record from GM in Regular but not from GENERAL MILLS LUCKY CHARMS
regular_sales <- my_data %>%
filter( flavor == "REGULAR" & brand != "GENERAL MILLS LUCKY CHARMS" & brand != "GENERAL MILLS CHEERIOS" ) %>%
mutate(volume_price = price / volume)
grid.arrange(
sales_GM_P <- regular_sales %>%
filter(producer!= "KELLOGGS") %>%
group_by(producer,week) %>%
summarise(sum_sales = sum(sales)) %>%
ggplot(aes(x = week, y = sum_sales, color = producer))+
geom_line()+
scale_y_continuous(labels = scales::dollar) +
labs(title = "Sales Pattern", color = " ") +
labs(title = "GM & Post Regular flavor Sales Pattern", x = "Week", y = "Total Sales", color =" ") +
scale_color_manual(values = c("#339966", "#0066cc"), guide = guide_legend(reverse = TRUE)) +
theme_classic(),
sales_GM_K <- regular_sales %>%
filter(producer != "POST") %>%
group_by(producer,week) %>%
summarise(sum_sales = sum(sales)) %>%
ggplot(aes(x = week, y = sum_sales, color = producer)) +
geom_line() +
scale_y_continuous(labels = scales::dollar) +
labs(title = "GM & Kelloggs Regular flavor Sales Pattern", x = "Week", y = "Total Sales", color =" ") +
scale_color_manual(values = c("#339966", "#ff6600"), guide = guide_legend(reverse = TRUE)) +
theme(plot.title = element_text(size=20, face = "bold")) +
theme_classic(),
ncol = 1
)
ggsave(filename = "sales_GM_P.png", width = 6, height = 2, plot = sales_GM_P)
ggsave(filename = "sales_GM_K.png", width = 6, height = 2, plot = sales_GM_K)
Finding :
We almost share the same sale pattern with Kelloggs, but not with the post. We can conclude that Kelloggs and we are selling to the same market with the same need. But why customers choose Kelloggs rather than us?
For the following, we list out several possible hypotheses and we tested them. We will focus mainly on the difference between us and Kelloggs since we share the same market.
regular_sales <- my_data %>%
filter( flavor == "REGULAR" & brand != "GENERAL MILLS LUCKY CHARMS" & brand != "GENERAL MILLS CHEERIOS" ) %>%
mutate(volume_price = price / volume)
price_volumn <- regular_sales %>%
group_by(producer,brand)%>%
mutate(unit_price = median(volume_price), n = n(), sd = sd(unit_price), ci = z*sd/sqrt(sd)) %>%
ggplot(aes(x = producer, y = unit_price , fill =brand))+
geom_bar(stat = "identity",position = "dodge") +
scale_y_continuous(labels = scales::dollar) +
labs(title = "Price per Volume in Regular Flavor", x = "Producer", y = "Price per Volume") +
theme(plot.title = element_text(size=24, face = "bold")) +
theme(legend.title = element_text(size=12), legend.text=element_text(size=5)) +
theme_classic()
price_volumn
ggsave(filename = "price_volumn.png", plot = price_volumn)
## Saving 7 x 5 in image
Finding : price per weight is higher than other brand
T test for finding :
# with Kelloggs
t.test(regular_sales$volume_price[regular_sales$producer == "GM"],regular_sales$volume_price[regular_sales$producer == "KELLOGGS"])
##
## Welch Two Sample t-test
##
## data: regular_sales$volume_price[regular_sales$producer == "GM"] and regular_sales$volume_price[regular_sales$producer == "KELLOGGS"]
## t = 49.93, df = 1905.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.683094 1.820721
## sample estimates:
## mean of x mean of y
## 5.027739 3.275832
# with post
t.test(regular_sales$volume_price[regular_sales$producer == "GM"],regular_sales$volume_price[regular_sales$producer == "POST"])
##
## Welch Two Sample t-test
##
## data: regular_sales$volume_price[regular_sales$producer == "GM"] and regular_sales$volume_price[regular_sales$producer == "POST"]
## t = 55.446, df = 1943.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.898393 2.037614
## sample estimates:
## mean of x mean of y
## 5.027739 3.059736
Test result :
Hypothesis confirmed, we are more expensive per volume than the other competitors in Regular
grid.arrange(
# for promotion
my_data %>%
group_by(flavor, promo,producer) %>%
ggplot(aes(x = flavor, fill = promo)) +
geom_bar(stat = "count",position = "dodge") +
facet_grid(producer~.),
# for ad
my_data %>%
group_by(flavor, promo,producer) %>%
ggplot(aes(x = flavor, fill = ad)) +
geom_bar(stat = "count",position = "dodge") +
facet_grid(producer~.),
ncol = 1
)
Findings:
For Promo
For Ad
Test for the hypothesis:
# Promotion holding frequency
GM_K <- my_data %>% filter (producer %in% c("GM","KELLOGGS") )
chisq.test(table(GM_K$promo, GM_K$promo))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(GM_K$promo, GM_K$promo)
## X-squared = 19366, df = 1, p-value < 2.2e-16
GM_P <- my_data %>% filter (producer %in% c("GM","POST") )
chisq.test(table(GM_P$promo, GM_P$promo))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(GM_P$promo, GM_P$promo)
## X-squared = 9660.5, df = 1, p-value < 2.2e-16
Test result :
Both Kelloggs and Post held more promotion then us.
# Test promotion impact on sales for Kelloggs and Post
# Kelloggs for all
t.test( K_record$sales[K_record$promo == "0"],K_record$sales[K_record$promo == "1"])
##
## Welch Two Sample t-test
##
## data: K_record$sales[K_record$promo == "0"] and K_record$sales[K_record$promo == "1"]
## t = -6.8113, df = 4638.1, p-value = 1.091e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.365500 -2.414138
## sample estimates:
## mean of x mean of y
## 29.17031 32.56013
# Kelloggs in Regular
k_r <-
K_record %>%
filter(flavor == "REGULAR")
t.test( k_r$sales[k_r$promo == "0"],k_r$sales[k_r$promo == "1"])
##
## Welch Two Sample t-test
##
## data: k_r$sales[k_r$promo == "0"] and k_r$sales[k_r$promo == "1"]
## t = -4.2765, df = 1658.5, p-value = 2.007e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.118801 -1.899749
## sample estimates:
## mean of x mean of y
## 34.11607 37.62535
# Post
t.test( P_record$sales[P_record$promo == "0"],P_record$sales[P_record$promo == "1"])
##
## Welch Two Sample t-test
##
## data: P_record$sales[P_record$promo == "0"] and P_record$sales[P_record$promo == "1"]
## t = -1.4442, df = 943.51, p-value = 0.149
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.0985188 0.4714192
## sample estimates:
## mean of x mean of y
## 22.44696 23.76051
Test result:
For Kelloggs :
p-value for all flavor :1.091e-11
p-value in Regular : 2.007e-05
Kelloggs promotion did help the sale, for the all brands and regular.
For Post :
p-value in Regular 0.149
Promotion did not increase the sales
Advertisment holding Frequecy
# ad do increase the sales and unit for all brand
regular_sales %>%
group_by(producer, ad) %>%
summarise( unit_sold = median(sales)) %>%
ggplot(aes(x = producer, y =unit_sold, fill = ad)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title= "Ad affects unit sold")
Finding: After making sure that ad does increasing sale perfomance in all producer
# promo holding frequency
regular_sales %>%
group_by(producer,ad) %>%
ggplot(aes(producer, fill = ad)) +
geom_bar(position = "fill")+
coord_flip()+
labs(title = "Ad Holding Frequency")
Finding: We did less than others in regular flavor. This maybe another reason that we lost sales in regular.
After analyzing, we summarized several findings as follow:
From the perspective of producer, we lost sales to Kelloggs but won post
From the perspective of flavor, we lost mainly in Regular, to both Kelloggs and Post.
The previous problem can cause by :
Too less brand in this flavor for customers to choose
Unnecessary promotion, which decrease the total revenue instead of boosted.
Too expensive price per volume
Too low advertisement holding frequency
We held too least promotion and advertisement compared to our competitors.
For all brands we have, promotion did increase unit sold, but for Cherrios the sales decreased and for KIX promotion didn’t make difference in unit sold.
For all brands we have, median and small advertisement don’t make difference in unit sold. For cost effective reason, we should stop holding median advertisement.
We should consider selling “Fruit” flavor, because that’s the only flavor we don’t have.
`