Which demographic group(s) contribute the most to our revenue, and how can we further capture market share in these segments?
library(completejourney)
library(dplyr)
library(ggplot2)
library(knitr)
transactions <- get_transactions()
products
## # A tibble: 92,331 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio… FRZN ICE ICE - CRUSH…
## 2 26081 2 MISCELLANEOUS Natio… <NA> <NA>
## 3 26093 69 PASTRY Priva… BREAD BREAD:ITALI…
## 4 26190 69 GROCERY Priva… FRUIT - SHELF S… APPLE SAUCE
## 5 26355 69 GROCERY Priva… COOKIES/CONES SPECIALTY C…
## 6 26426 69 GROCERY Priva… SPICES & EXTRAC… SPICES & SE…
## 7 26540 69 GROCERY Priva… COOKIES/CONES TRAY PACK/C…
## 8 26601 69 DRUG GM Priva… VITAMINS VITAMIN - M…
## 9 26636 69 PASTRY Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691 16 GROCERY Priva… PNT BTR/JELLY/J… HONEY
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
demographics
## # A tibble: 801 × 8
## household_id age income home_ownership marital_status household_size
## <chr> <ord> <ord> <ord> <ord> <ord>
## 1 1 65+ 35-49K Homeowner Married 2
## 2 1001 45-54 50-74K Homeowner Unmarried 1
## 3 1003 35-44 25-34K <NA> Unmarried 1
## 4 1004 25-34 15-24K <NA> Unmarried 1
## 5 101 45-54 Under 15K Homeowner Married 4
## 6 1012 35-44 35-49K <NA> Married 5+
## 7 1014 45-54 15-24K <NA> Married 4
## 8 1015 45-54 50-74K Homeowner Unmarried 1
## 9 1018 45-54 35-49K Homeowner Married 5+
## 10 1020 45-54 25-34K Homeowner Married 2
## # ℹ 791 more rows
## # ℹ 2 more variables: household_comp <ord>, kids_count <ord>
str(transactions)
## tibble [1,469,307 × 11] (S3: tbl_df/tbl/data.frame)
## $ household_id : chr [1:1469307] "900" "900" "1228" "906" ...
## $ store_id : chr [1:1469307] "330" "330" "406" "319" ...
## $ basket_id : chr [1:1469307] "31198570044" "31198570047" "31198655051" "31198705046" ...
## $ product_id : chr [1:1469307] "1095275" "9878513" "1041453" "1020156" ...
## $ quantity : num [1:1469307] 1 1 1 1 2 1 1 1 1 1 ...
## $ sales_value : num [1:1469307] 0.5 0.99 1.43 1.5 2.78 5.49 1.5 1.88 1.5 2.69 ...
## $ retail_disc : num [1:1469307] 0 0.1 0.15 0.29 0.8 0.5 0.29 0.21 1.29 0 ...
## $ coupon_disc : num [1:1469307] 0 0 0 0 0 0 0 0 0 0 ...
## $ coupon_match_disc : num [1:1469307] 0 0 0 0 0 0 0 0 0 0 ...
## $ week : int [1:1469307] 1 1 1 1 1 1 1 1 1 1 ...
## $ transaction_timestamp: POSIXct[1:1469307], format: "2017-01-01 06:53:26" "2017-01-01 07:10:28" ...
str(products)
## spc_tbl_ [92,331 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ product_id : chr [1:92331] "25671" "26081" "26093" "26190" ...
## $ manufacturer_id : chr [1:92331] "2" "2" "69" "69" ...
## $ department : chr [1:92331] "GROCERY" "MISCELLANEOUS" "PASTRY" "GROCERY" ...
## $ brand : Factor w/ 2 levels "National","Private": 1 1 2 2 2 2 2 2 2 2 ...
## $ product_category: chr [1:92331] "FRZN ICE" NA "BREAD" "FRUIT - SHELF STABLE" ...
## $ product_type : chr [1:92331] "ICE - CRUSHED/CUBED" NA "BREAD:ITALIAN/FRENCH" "APPLE SAUCE" ...
## $ package_size : chr [1:92331] "22 LB" NA NA "50 OZ" ...
str(demographics)
## spc_tbl_ [801 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ household_id : chr [1:801] "1" "1001" "1003" "1004" ...
## $ age : Ord.factor w/ 6 levels "19-24"<"25-34"<..: 6 4 3 2 4 3 4 4 4 4 ...
## $ income : Ord.factor w/ 12 levels "Under 15K"<"15-24K"<..: 4 5 3 2 1 4 2 5 4 3 ...
## $ home_ownership: Ord.factor w/ 5 levels "Renter"<"Probable Renter"<..: 3 3 NA NA 3 NA NA 3 3 3 ...
## $ marital_status: Ord.factor w/ 3 levels "Married"<"Unmarried"<..: 1 2 2 2 1 1 1 2 1 1 ...
## $ household_size: Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 2 1 1 1 4 5 4 1 5 2 ...
## $ household_comp: Ord.factor w/ 5 levels "1 Adult Kids"<..: 4 2 2 2 3 3 3 2 3 4 ...
## $ kids_count : Ord.factor w/ 5 levels "0"<"1"<"2"<"3+"<..: 1 1 1 1 3 4 3 1 4 1 ...
combined_data <- transactions %>%
inner_join(products, by = "product_id") %>%
inner_join(demographics, by = "household_id")
revenue_by_demographic <- combined_data %>%
group_by(age, income, home_ownership, marital_status) %>%
summarise(total_revenue = sum(sales_value)) %>%
arrange(desc(total_revenue))
top_demographic <- revenue_by_demographic[1, ]
average_revenue_by_marital_status <- revenue_by_demographic %>%
group_by(marital_status) %>%
summarise(average_revenue = mean(total_revenue))
ggplot(average_revenue_by_marital_status, aes(x = reorder(marital_status, -average_revenue), y = average_revenue)) +
geom_bar(stat = "identity", fill = "skyblue") +
theme_minimal() +
labs(x = "Marital Status", y = "Average Revenue") +
coord_flip()
There is a strong correlation between higher average revenue for stores from married individuals. Unmarried or an unkown relationship usually leads to less average sales revenue for stores.
average_revenue_by_age <- revenue_by_demographic %>%
group_by(age) %>%
summarise(average_revenue = mean(total_revenue))
ggplot(average_revenue_by_age, aes(x = reorder(age, -average_revenue), y = average_revenue)) +
geom_bar(stat = "identity", fill = "skyblue") +
theme_minimal() +
labs(x = "Age Group", y = "Average Revenue") +
coord_flip()
The three age groups from 25-54 have the highest average revenue for stores, and there is a large difference between their average revenue and the two age groups from 55+ as well as the 19-24 year old age group.
filtered_data <- revenue_by_demographic %>%
filter(marital_status %in% c("Married", "Unmarried")) %>%
group_by(age, marital_status) %>%
summarise(count = n(), average_revenue = mean(total_revenue))
ggplot(filtered_data, aes(x = age, y = count, fill = marital_status)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(x = "Age Group", y = "Count", fill = "Marital Status") +
scale_fill_brewer(palette = "Blues") +
facet_wrap(~ marital_status, scales = "free", ncol = 2, strip.position = "bottom") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Now we show the amount of people shown by age and marital status to see how large each demographic is and which ones will be best to focus on. Ages25-54 have a higher amount of people and will be the best to focus on.
average_sales_value_by_marital_status <- combined_data %>%
group_by(marital_status) %>%
summarise(average_sales_value = mean(sales_value, na.rm = TRUE)) # Hand
ggplot(average_sales_value_by_marital_status, aes(x = average_sales_value, y = reorder(marital_status, -average_sales_value)))+
geom_bar(stat = "identity", fill = "skyblue") +
theme_minimal() +
labs(x = "Average Sales Value", y = "Marital Status") +
coord_flip()
The average sale for each individual isn’t drastically different, people of all marital status’ are spending roughly the same amount per sale.
average_sales_value_by_age <- combined_data %>%
group_by(age) %>%
summarise(average_sales_value = mean(sales_value, na.rm = TRUE))
ggplot(average_sales_value_by_age, aes(x = average_sales_value, y = reorder(age, -average_sales_value)))+
geom_bar(stat = "identity", fill = "skyblue") +
theme_minimal() +
labs(x = "Average Sales Value", y = "Age Group") +
coord_flip()
Again, here we can see there is not much of a difference for average sales value for each of the age groups.
average_sales_value_by_marital_status_age <- combined_data %>%
group_by(age, marital_status) %>%
summarise(average_sales_value = mean(sales_value, na.rm = TRUE)) %>%
filter(marital_status %in% c("Married", "Unmarried"))
ggplot(average_sales_value_by_marital_status_age, aes(x = age, y = average_sales_value, fill = marital_status)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(x = "Age Group", y = "Average Sales Value", fill = "Marital Status") +
scale_fill_brewer(palette = "Blues") +
facet_wrap(~ marital_status, scales = "free", ncol = 2, strip.position = "bottom") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Lastly, we see there is only a slight difference between each age group and marital status, with unmarried 55-64 year old’s spending the most per sale and unmarried 65+ year old’s spending the least per sale.
The three age groups from 25-54 have the highest average revenue for stores, and there is a large difference between their average revenue and the two age groups from 55+ as well as the 19-24 year old age group. These groups are driving a large portion of revenue. When looking at the data, we realize that the average sale amount is roughly the same across all demographics. This means we need to focus on increasing volume, whether that is by trying to drive more consumers that are underrepresented in this data such as unmarried people. Married people account for more revenue on an average basis compared to unmarried or unknown relationship status’, and this could lead to the opportunity to fully maximize this and target married people to capture even more of that market.