rm(list = ls())
#import data
mars <- read_csv("petfood_retail_table.csv", show_col_types = FALSE)
mars <- as.data.frame(mars)
# check column type
kable(sapply(mars, class), col.names = "Column Type")
Column Type
DATE Date
SHOPPER_ID numeric
SHOPPER_BASKET character
STORE_ID numeric
ZIP_CODE numeric
MANU_ID character
BRAND_ID character
SUB_BRAND_ID character
PRODUCT numeric
SPECIES character
FOOD_TYPE character
BASE_PRICE numeric
PRICE numeric
PROMO_FLAG numeric
UNITS numeric
PRICE_TIER character
PACKAGE_SIZE character
FEEDING_PHILOSOPHY character
NATURAL_CLAIM character
TEXTURE_FORMAT character
LIFESTAGE character
# create data summary
stargazer(mars, type = "text", 
          title = "Summary Statistics")
## 
## Summary Statistics
## ====================================================================================
## Statistic    N          Mean             St. Dev.           Min            Max      
## ------------------------------------------------------------------------------------
## SHOPPER_ID 8,890   2,857,664.000         302.414         2,857,040      2,857,978   
## STORE_ID   8,890      759.661           1,041.886            2            6,792     
## ZIP_CODE   8,890     36,172.240         12,669.360        15,001         46,032     
## PRODUCT    8,890 17,533,833,098.000 35,002,644,965.000 1,669,818,774 133,022,550,933
## BASE_PRICE 8,890       4.546              6.858            0.190         55.990     
## PRICE      8,890       4.272              6.537            0.100         55.990     
## PROMO_FLAG 8,890       0.264              0.441              0              1       
## UNITS      8,890       1.908              1.810              1             31       
## ------------------------------------------------------------------------------------
# explore units solt based on MANU_ID and BRAND_ID
ggplot(mars, aes(x=MANU_ID, y = UNITS, fill = BRAND_ID)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90))

# create new discount variable 
mars$DISCOUNT <- round(((mars$BASE_PRICE - mars$PRICE)/mars$BASE_PRICE), 2)
# create revenue variable (price * quantity)
mars$REVENUE <- mars$PRICE * mars$UNITS
# purchases for dogs based on MANU_ID
dogs <- mars %>%
    filter(SPECIES == "DOG") %>%
    group_by(MANU_ID, SPECIES) %>%
    tally()

ggplot(dogs, aes(x = MANU_ID, y = n, fill = SPECIES)) +
    geom_bar(stat="identity") +
    xlab("Manufacturer") +
    ylab("Count") +
    theme(axis.text.x = element_text(angle = 90))

# purchases for cats based on MANU_ID
cats <- mars %>%
    filter(SPECIES == "CAT") %>%
    group_by(MANU_ID, SPECIES) %>%
    tally()

ggplot(cats, aes(x = MANU_ID, y = n, fill = SPECIES)) +
    geom_bar(stat="identity", color = "blue", aes(fill = SPECIES)) +
    xlab("Manufacturer") +
    ylab("Count") +
    theme(axis.text.x = element_text(angle = 90)) +
    scale_fill_manual(values="blue")

# proportion of revenue generated based PROMO_FLAG by each MANU_ID
ggplot(mars, aes(x = MANU_ID, y = REVENUE, fill = as.factor(PROMO_FLAG))) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Pastel1") +
   theme(axis.text.x = element_text(angle = 90))

# revenue generated by species for mars products
ggplot(mars_only, aes(x=SPECIES, y=REVENUE, fill = SPECIES)) + 
  geom_bar(stat = "identity")

# proportion of revenue based on PROMO_FLAG by species (mars only)
ggplot(mars_only, aes(x = SPECIES, y = REVENUE, fill = as.factor(PROMO_FLAG))) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Pastel1")

Proportion of discounted goods between cats and dogs seem to be about the same.

Less than 25% of the purchases are of discounted items.

table(mars_only$SPECIES, mars_only$PROMO_FLAG)
##      
##         0   1
##   CAT 398 112
##   DOG 896 241
# revenue based on price tier
ggplot(mars_only, aes(x=PRICE_TIER, y = REVENUE, fill = PRICE_TIER)) + 
  geom_bar(stat = "identity") + 
  theme(axis.text.x = element_text(angle = 90))

# revenue based on mars brands
ggplot(mars_only, aes(x=BRAND_ID, y = REVENUE, fill = BRAND_ID)) + 
  geom_bar(stat = "identity") + 
  theme(axis.text.x = element_text(angle = 90)) + 
  scale_fill_brewer(palette = "Accent")

# create month variable
mars_only$MONTH <- as.Date(cut(mars_only$DATE,
  breaks = "month"))
# generate revenue by month over time 
ggplot(data = mars_only,
  aes(MONTH, REVENUE)) +
  ggtitle("Monthly Sales") +
  stat_summary(fun.y = sum, # adds up all observations for the month
    geom = "line") + # or "line"
  scale_x_date(
    labels = date_format("%Y-%m"),
    breaks = "1 month") +  # custom x-axis labels
    theme(axis.text.x = element_text(angle = 90))