Which demographic group(s) contribute the most to our revenue, and how can we further capture market share in these segments?

library(completejourney)
library(dplyr)
library(ggplot2)
library(knitr)
transactions <- get_transactions()
products
## # A tibble: 92,331 × 7
##    product_id manufacturer_id department    brand  product_category product_type
##    <chr>      <chr>           <chr>         <fct>  <chr>            <chr>       
##  1 25671      2               GROCERY       Natio… FRZN ICE         ICE - CRUSH…
##  2 26081      2               MISCELLANEOUS Natio… <NA>             <NA>        
##  3 26093      69              PASTRY        Priva… BREAD            BREAD:ITALI…
##  4 26190      69              GROCERY       Priva… FRUIT - SHELF S… APPLE SAUCE 
##  5 26355      69              GROCERY       Priva… COOKIES/CONES    SPECIALTY C…
##  6 26426      69              GROCERY       Priva… SPICES & EXTRAC… SPICES & SE…
##  7 26540      69              GROCERY       Priva… COOKIES/CONES    TRAY PACK/C…
##  8 26601      69              DRUG GM       Priva… VITAMINS         VITAMIN - M…
##  9 26636      69              PASTRY        Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691      16              GROCERY       Priva… PNT BTR/JELLY/J… HONEY       
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
demographics
## # A tibble: 801 × 8
##    household_id age   income    home_ownership marital_status household_size
##    <chr>        <ord> <ord>     <ord>          <ord>          <ord>         
##  1 1            65+   35-49K    Homeowner      Married        2             
##  2 1001         45-54 50-74K    Homeowner      Unmarried      1             
##  3 1003         35-44 25-34K    <NA>           Unmarried      1             
##  4 1004         25-34 15-24K    <NA>           Unmarried      1             
##  5 101          45-54 Under 15K Homeowner      Married        4             
##  6 1012         35-44 35-49K    <NA>           Married        5+            
##  7 1014         45-54 15-24K    <NA>           Married        4             
##  8 1015         45-54 50-74K    Homeowner      Unmarried      1             
##  9 1018         45-54 35-49K    Homeowner      Married        5+            
## 10 1020         45-54 25-34K    Homeowner      Married        2             
## # ℹ 791 more rows
## # ℹ 2 more variables: household_comp <ord>, kids_count <ord>
str(transactions)
## tibble [1,469,307 × 11] (S3: tbl_df/tbl/data.frame)
##  $ household_id         : chr [1:1469307] "900" "900" "1228" "906" ...
##  $ store_id             : chr [1:1469307] "330" "330" "406" "319" ...
##  $ basket_id            : chr [1:1469307] "31198570044" "31198570047" "31198655051" "31198705046" ...
##  $ product_id           : chr [1:1469307] "1095275" "9878513" "1041453" "1020156" ...
##  $ quantity             : num [1:1469307] 1 1 1 1 2 1 1 1 1 1 ...
##  $ sales_value          : num [1:1469307] 0.5 0.99 1.43 1.5 2.78 5.49 1.5 1.88 1.5 2.69 ...
##  $ retail_disc          : num [1:1469307] 0 0.1 0.15 0.29 0.8 0.5 0.29 0.21 1.29 0 ...
##  $ coupon_disc          : num [1:1469307] 0 0 0 0 0 0 0 0 0 0 ...
##  $ coupon_match_disc    : num [1:1469307] 0 0 0 0 0 0 0 0 0 0 ...
##  $ week                 : int [1:1469307] 1 1 1 1 1 1 1 1 1 1 ...
##  $ transaction_timestamp: POSIXct[1:1469307], format: "2017-01-01 06:53:26" "2017-01-01 07:10:28" ...
str(products)
## spc_tbl_ [92,331 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ product_id      : chr [1:92331] "25671" "26081" "26093" "26190" ...
##  $ manufacturer_id : chr [1:92331] "2" "2" "69" "69" ...
##  $ department      : chr [1:92331] "GROCERY" "MISCELLANEOUS" "PASTRY" "GROCERY" ...
##  $ brand           : Factor w/ 2 levels "National","Private": 1 1 2 2 2 2 2 2 2 2 ...
##  $ product_category: chr [1:92331] "FRZN ICE" NA "BREAD" "FRUIT - SHELF STABLE" ...
##  $ product_type    : chr [1:92331] "ICE - CRUSHED/CUBED" NA "BREAD:ITALIAN/FRENCH" "APPLE SAUCE" ...
##  $ package_size    : chr [1:92331] "22 LB" NA NA "50 OZ" ...
str(demographics)
## spc_tbl_ [801 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ household_id  : chr [1:801] "1" "1001" "1003" "1004" ...
##  $ age           : Ord.factor w/ 6 levels "19-24"<"25-34"<..: 6 4 3 2 4 3 4 4 4 4 ...
##  $ income        : Ord.factor w/ 12 levels "Under 15K"<"15-24K"<..: 4 5 3 2 1 4 2 5 4 3 ...
##  $ home_ownership: Ord.factor w/ 5 levels "Renter"<"Probable Renter"<..: 3 3 NA NA 3 NA NA 3 3 3 ...
##  $ marital_status: Ord.factor w/ 3 levels "Married"<"Unmarried"<..: 1 2 2 2 1 1 1 2 1 1 ...
##  $ household_size: Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 2 1 1 1 4 5 4 1 5 2 ...
##  $ household_comp: Ord.factor w/ 5 levels "1 Adult Kids"<..: 4 2 2 2 3 3 3 2 3 4 ...
##  $ kids_count    : Ord.factor w/ 5 levels "0"<"1"<"2"<"3+"<..: 1 1 1 1 3 4 3 1 4 1 ...
combined_data <- transactions %>%
  inner_join(products, by = "product_id") %>%
  inner_join(demographics, by = "household_id")

revenue_by_demographic <- combined_data %>%
  group_by(age, income, home_ownership, marital_status) %>%
  summarise(total_revenue = sum(sales_value)) %>%
  arrange(desc(total_revenue))

Average Sales by Marital Status

top_demographic <- revenue_by_demographic[1, ]

average_revenue_by_marital_status <- revenue_by_demographic %>%
  group_by(marital_status) %>%
  summarise(average_revenue = mean(total_revenue))

ggplot(average_revenue_by_marital_status, aes(x = reorder(marital_status, -average_revenue), y = average_revenue)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(x = "Marital Status", y = "Average Revenue") +
  coord_flip()

There is a strong correlation between higher average revenue for stores from married individuals. Unmarried or an unkown relationship usually leads to less average sales revenue for stores.

Average Sales by Age Group

average_revenue_by_age <- revenue_by_demographic %>%
  group_by(age) %>%
  summarise(average_revenue = mean(total_revenue))

ggplot(average_revenue_by_age, aes(x = reorder(age, -average_revenue), y = average_revenue)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(x = "Age Group", y = "Average Revenue") +
  coord_flip()

The three age groups from 25-54 have the highest average revenue for stores, and there is a large difference between their average revenue and the two age groups from 55+ as well as the 19-24 year old age group.

Amount of People in Each Age Group by Marital Status

filtered_data <- revenue_by_demographic %>%
  filter(marital_status %in% c("Married", "Unmarried")) %>%
  group_by(age, marital_status) %>%
  summarise(count = n(), average_revenue = mean(total_revenue))

ggplot(filtered_data, aes(x = age, y = count, fill = marital_status)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(x = "Age Group", y = "Count", fill = "Marital Status") +
  scale_fill_brewer(palette = "Blues") +
  facet_wrap(~ marital_status, scales = "free", ncol = 2, strip.position = "bottom") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Now we show the amount of people shown by age and marital status to see how large each demographic is and which ones will be best to focus on. Ages25-54 have a higher amount of people and will be the best to focus on.

Average Sales Value by Marital Status

average_sales_value_by_marital_status <- combined_data %>%
  group_by(marital_status) %>%
  summarise(average_sales_value = mean(sales_value, na.rm = TRUE))  # Hand

ggplot(average_sales_value_by_marital_status, aes(x = average_sales_value, y = reorder(marital_status, -average_sales_value)))+
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(x = "Average Sales Value", y = "Marital Status") +
  coord_flip()

The average sale for each individual isn’t drastically different, people of all marital status’ are spending roughly the same amount per sale.

Average Sales value by Age Group

average_sales_value_by_age <- combined_data %>%
  group_by(age) %>%
  summarise(average_sales_value = mean(sales_value, na.rm = TRUE))

ggplot(average_sales_value_by_age, aes(x = average_sales_value, y = reorder(age, -average_sales_value)))+
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(x = "Average Sales Value", y = "Age Group") +
  coord_flip()

Again, here we can see there is not much of a difference for average sales value for each of the age groups.

Average Sales Value by Marital Status and Age Group

average_sales_value_by_marital_status_age <- combined_data %>%
  group_by(age, marital_status) %>%
  summarise(average_sales_value = mean(sales_value, na.rm = TRUE)) %>%
  filter(marital_status %in% c("Married", "Unmarried"))

ggplot(average_sales_value_by_marital_status_age, aes(x = age, y = average_sales_value, fill = marital_status)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(x = "Age Group", y = "Average Sales Value", fill = "Marital Status") +
  scale_fill_brewer(palette = "Blues") +
  facet_wrap(~ marital_status, scales = "free", ncol = 2, strip.position = "bottom") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Lastly, we see there is only a slight difference between each age group and marital status, with unmarried 55-64 year old’s spending the most per sale and unmarried 65+ year old’s spending the least per sale.

Conclusion:

The three age groups from 25-54 have the highest average revenue for stores, and there is a large difference between their average revenue and the two age groups from 55+ as well as the 19-24 year old age group. These groups are driving a large portion of revenue. When looking at the data, we realize that the average sale amount is roughly the same across all demographics. This means we need to focus on increasing volume, whether that is by trying to drive more consumers that are underrepresented in this data such as unmarried people. Married people account for more revenue on an average basis compared to unmarried or unknown relationship status’, and this could lead to the opportunity to fully maximize this and target married people to capture even more of that market.