library(completejourney)
## Warning: package 'completejourney' was built under R version 4.1.3
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Bind the sampled transactions data to the shorter name `transactions`,
# which the rest of the script uses.
transactions <- transactions_sample
# Auto-print the products table to preview its columns and first rows.
products
## # A tibble: 92,331 x 7
##    product_id manufacturer_id department    brand  product_category product_type
##    <chr>      <chr>           <chr>         <fct>  <chr>            <chr>       
##  1 25671      2               GROCERY       Natio~ FRZN ICE         ICE - CRUSH~
##  2 26081      2               MISCELLANEOUS Natio~ <NA>             <NA>        
##  3 26093      69              PASTRY        Priva~ BREAD            BREAD:ITALI~
##  4 26190      69              GROCERY       Priva~ FRUIT - SHELF S~ APPLE SAUCE 
##  5 26355      69              GROCERY       Priva~ COOKIES/CONES    SPECIALTY C~
##  6 26426      69              GROCERY       Priva~ SPICES & EXTRAC~ SPICES & SE~
##  7 26540      69              GROCERY       Priva~ COOKIES/CONES    TRAY PACK/C~
##  8 26601      69              DRUG GM       Priva~ VITAMINS         VITAMIN - M~
##  9 26636      69              PASTRY        Priva~ BREAKFAST SWEETS SW GDS: SW ~
## 10 26691      16              GROCERY       Priva~ PNT BTR/JELLY/J~ HONEY       
## # ... with 92,321 more rows, and 1 more variable: package_size <chr>
# Auto-print the transactions table to preview its columns and first rows.
transactions
## # A tibble: 75,000 x 11
##    household_id store_id basket_id   product_id quantity sales_value retail_disc
##    <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
##  1 2261         309      31625220889 940996            1        3.86        0.43
##  2 2131         368      32053127496 873902            1        1.59        0.9 
##  3 511          316      32445856036 847901            1        1           0.69
##  4 400          388      31932241118 13094913          2       11.9         2.9 
##  5 918          340      32074655895 1085604           1        1.29        0   
##  6 718          324      32614612029 883203            1        2.5         0.49
##  7 868          323      32074722463 9884484           1        3.49        0   
##  8 1688         450      34850403304 1028715           1        2           1.79
##  9 467          31782    31280745102 896613            2        6.55        4.44
## 10 1947         32004    32744181707 978497            1        3.99        0   
## # ... with 74,990 more rows, and 4 more variables: coupon_disc <dbl>,
## #   coupon_match_disc <dbl>, week <int>, transaction_timestamp <dttm>
# Auto-print the demographics table to preview its columns and first rows.
demographics
## # A tibble: 801 x 8
##    household_id age   income    home_ownership marital_status household_size
##    <chr>        <ord> <ord>     <ord>          <ord>          <ord>         
##  1 1            65+   35-49K    Homeowner      Married        2             
##  2 1001         45-54 50-74K    Homeowner      Unmarried      1             
##  3 1003         35-44 25-34K    <NA>           Unmarried      1             
##  4 1004         25-34 15-24K    <NA>           Unmarried      1             
##  5 101          45-54 Under 15K Homeowner      Married        4             
##  6 1012         35-44 35-49K    <NA>           Married        5+            
##  7 1014         45-54 15-24K    <NA>           Married        4             
##  8 1015         45-54 50-74K    Homeowner      Unmarried      1             
##  9 1018         45-54 35-49K    Homeowner      Married        5+            
## 10 1020         45-54 25-34K    Homeowner      Married        2             
## # ... with 791 more rows, and 2 more variables: household_comp <ord>,
## #   kids_count <ord>
# Mean sales value per basket, keeping only baskets whose mean exceeds 20.
ttrans <- transactions %>%
  group_by(basket_id) %>%
  summarize(sales_val = mean(sales_value, na.rm = TRUE)) %>%
  filter(sales_val > 20)

# Single box plot of the per-basket means on a log10 axis with dollar labels.
# The constant string "Basket ID" is used as x so all baskets fall in one box.
ggplot(ttrans, aes("Basket ID", sales_val)) +
  geom_boxplot(outlier.alpha = .25) +
  scale_y_log10(
    labels = scales::dollar,
    # Breaks come from the plotted data: `sales_val` exists only on `ttrans`,
    # not on `transactions`, so the quantiles must be taken from `ttrans`.
    breaks = quantile(ttrans$sales_val)
  )

# Plot income bracket against age bracket for each household. Both columns
# are ordered factors, so points land on a grid; the low alpha lets cells
# with many overlapping households render darker.
demographics %>%
  ggplot(mapping = aes(x = age, y = income)) +
  geom_point(alpha = 0.1) +
  labs(title = "Income by Age", x = "Age", y = "Income")

# Histogram of individual transaction sales values using bins of width 1.
ggplot(transactions, aes(sales_value)) +
  geom_histogram(binwidth = 1) +
  # Title previously had an unbalanced "(" -- closing parenthesis added.
  ggtitle("Amount of different sales values in the Transactions Data (Bin Width = 1)")

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.