rm(list = ls())
#import data
mars <- read_csv("petfood_retail_table.csv", show_col_types = FALSE)
mars <- as.data.frame(mars)
# check column type
kable(sapply(mars, class), col.names = "Column Type")
| Column Type | |
|---|---|
| DATE | Date |
| SHOPPER_ID | numeric |
| SHOPPER_BASKET | character |
| STORE_ID | numeric |
| ZIP_CODE | numeric |
| MANU_ID | character |
| BRAND_ID | character |
| SUB_BRAND_ID | character |
| PRODUCT | numeric |
| SPECIES | character |
| FOOD_TYPE | character |
| BASE_PRICE | numeric |
| PRICE | numeric |
| PROMO_FLAG | numeric |
| UNITS | numeric |
| PRICE_TIER | character |
| PACKAGE_SIZE | character |
| FEEDING_PHILOSOPHY | character |
| NATURAL_CLAIM | character |
| TEXTURE_FORMAT | character |
| LIFESTAGE | character |
# create data summary
stargazer(mars, type = "text",
title = "Summary Statistics")
##
## Summary Statistics
## ====================================================================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------------------------------------------------
## SHOPPER_ID 8,890 2,857,664.000 302.414 2,857,040 2,857,978
## STORE_ID 8,890 759.661 1,041.886 2 6,792
## ZIP_CODE 8,890 36,172.240 12,669.360 15,001 46,032
## PRODUCT 8,890 17,533,833,098.000 35,002,644,965.000 1,669,818,774 133,022,550,933
## BASE_PRICE 8,890 4.546 6.858 0.190 55.990
## PRICE 8,890 4.272 6.537 0.100 55.990
## PROMO_FLAG 8,890 0.264 0.441 0 1
## UNITS 8,890 1.908 1.810 1 31
## ------------------------------------------------------------------------------------
# explore units solt based on MANU_ID and BRAND_ID
ggplot(mars, aes(x=MANU_ID, y = UNITS, fill = BRAND_ID)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle = 90))
# create new discount variable
mars$DISCOUNT <- round(((mars$BASE_PRICE - mars$PRICE)/mars$BASE_PRICE), 2)
# create revenue variable (price * quantity)
mars$REVENUE <- mars$PRICE * mars$UNITS
# purchases for dogs based on MANU_ID
dogs <- mars %>%
filter(SPECIES == "DOG") %>%
group_by(MANU_ID, SPECIES) %>%
tally()
ggplot(dogs, aes(x = MANU_ID, y = n, fill = SPECIES)) +
geom_bar(stat="identity") +
xlab("Manufacturer") +
ylab("Count") +
theme(axis.text.x = element_text(angle = 90))
# purchases for cats based on MANU_ID
cats <- mars %>%
filter(SPECIES == "CAT") %>%
group_by(MANU_ID, SPECIES) %>%
tally()
ggplot(cats, aes(x = MANU_ID, y = n, fill = SPECIES)) +
geom_bar(stat="identity", color = "blue", aes(fill = SPECIES)) +
xlab("Manufacturer") +
ylab("Count") +
theme(axis.text.x = element_text(angle = 90)) +
scale_fill_manual(values="blue")
# proportion of revenue generated based PROMO_FLAG by each MANU_ID
ggplot(mars, aes(x = MANU_ID, y = REVENUE, fill = as.factor(PROMO_FLAG))) +
geom_col(position = "fill") +
scale_y_continuous(labels = scales::percent) +
scale_fill_brewer(palette = "Pastel1") +
theme(axis.text.x = element_text(angle = 90))
# revenue generated by species for mars products
ggplot(mars_only, aes(x=SPECIES, y=REVENUE, fill = SPECIES)) +
geom_bar(stat = "identity")
# proportion of revenue based on PROMO_FLAG by species (mars only)
ggplot(mars_only, aes(x = SPECIES, y = REVENUE, fill = as.factor(PROMO_FLAG))) +
geom_col(position = "fill") +
scale_y_continuous(labels = scales::percent) +
scale_fill_brewer(palette = "Pastel1")
Proportion of discounted goods between cats and dogs seem to be about the same.
Less than 25% of the purchases are of discounted items.
table(mars_only$SPECIES, mars_only$PROMO_FLAG)
##
## 0 1
## CAT 398 112
## DOG 896 241
# revenue based on price tier
ggplot(mars_only, aes(x=PRICE_TIER, y = REVENUE, fill = PRICE_TIER)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90))
# revenue based on mars brands
ggplot(mars_only, aes(x=BRAND_ID, y = REVENUE, fill = BRAND_ID)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90)) +
scale_fill_brewer(palette = "Accent")
# create month variable
mars_only$MONTH <- as.Date(cut(mars_only$DATE,
breaks = "month"))
# generate revenue by month over time
ggplot(data = mars_only,
aes(MONTH, REVENUE)) +
ggtitle("Monthly Sales") +
stat_summary(fun.y = sum, # adds up all observations for the month
geom = "line") + # or "line"
scale_x_date(
labels = date_format("%Y-%m"),
breaks = "1 month") + # custom x-axis labels
theme(axis.text.x = element_text(angle = 90))