library(completejourney)
## Warning: package 'completejourney' was built under R version 4.1.3
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Alias the sample transactions data set as `transactions`; the later
# summaries and plots in this script refer to it by that shorter name.
transactions <- transactions_sample
# Print the products table to inspect its columns.
products
## # A tibble: 92,331 x 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio~ FRZN ICE ICE - CRUSH~
## 2 26081 2 MISCELLANEOUS Natio~ <NA> <NA>
## 3 26093 69 PASTRY Priva~ BREAD BREAD:ITALI~
## 4 26190 69 GROCERY Priva~ FRUIT - SHELF S~ APPLE SAUCE
## 5 26355 69 GROCERY Priva~ COOKIES/CONES SPECIALTY C~
## 6 26426 69 GROCERY Priva~ SPICES & EXTRAC~ SPICES & SE~
## 7 26540 69 GROCERY Priva~ COOKIES/CONES TRAY PACK/C~
## 8 26601 69 DRUG GM Priva~ VITAMINS VITAMIN - M~
## 9 26636 69 PASTRY Priva~ BREAKFAST SWEETS SW GDS: SW ~
## 10 26691 16 GROCERY Priva~ PNT BTR/JELLY/J~ HONEY
## # ... with 92,321 more rows, and 1 more variable: package_size <chr>
# Print the transactions table to inspect its columns.
transactions
## # A tibble: 75,000 x 11
## household_id store_id basket_id product_id quantity sales_value retail_disc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2261 309 31625220889 940996 1 3.86 0.43
## 2 2131 368 32053127496 873902 1 1.59 0.9
## 3 511 316 32445856036 847901 1 1 0.69
## 4 400 388 31932241118 13094913 2 11.9 2.9
## 5 918 340 32074655895 1085604 1 1.29 0
## 6 718 324 32614612029 883203 1 2.5 0.49
## 7 868 323 32074722463 9884484 1 3.49 0
## 8 1688 450 34850403304 1028715 1 2 1.79
## 9 467 31782 31280745102 896613 2 6.55 4.44
## 10 1947 32004 32744181707 978497 1 3.99 0
## # ... with 74,990 more rows, and 4 more variables: coupon_disc <dbl>,
## # coupon_match_disc <dbl>, week <int>, transaction_timestamp <dttm>
# Print the household demographics table to inspect its columns.
demographics
## # A tibble: 801 x 8
## household_id age income home_ownership marital_status household_size
## <chr> <ord> <ord> <ord> <ord> <ord>
## 1 1 65+ 35-49K Homeowner Married 2
## 2 1001 45-54 50-74K Homeowner Unmarried 1
## 3 1003 35-44 25-34K <NA> Unmarried 1
## 4 1004 25-34 15-24K <NA> Unmarried 1
## 5 101 45-54 Under 15K Homeowner Married 4
## 6 1012 35-44 35-49K <NA> Married 5+
## 7 1014 45-54 15-24K <NA> Married 4
## 8 1015 45-54 50-74K Homeowner Unmarried 1
## 9 1018 45-54 35-49K Homeowner Married 5+
## 10 1020 45-54 25-34K Homeowner Married 2
## # ... with 791 more rows, and 2 more variables: household_comp <ord>,
## # kids_count <ord>
# Per-basket summary: average `sales_value` across the rows of each basket
# (ignoring missing values), keeping only baskets whose average exceeds 20.
ttrans <- transactions %>%
  group_by(basket_id) %>%
  summarise(
    sales_val = mean(sales_value, na.rm = TRUE)
  ) %>%
  filter(sales_val > 20)
# Box plot of the per-basket average sales values held in `ttrans`, drawn on
# a log10 axis with dollar-formatted labels. The constant string mapped to x
# places every basket in a single box labelled "Basket ID".
ggplot(ttrans, aes(x = "Basket ID", y = sales_val)) +
  geom_boxplot(outlier.alpha = 0.25) +
  scale_y_log10(
    labels = scales::dollar,
    # Put the axis breaks at the quartiles of the plotted values. `sales_val`
    # exists only in `ttrans`; `transactions$sales_val` is an unknown column
    # (NULL with a warning), which leaves the axis without usable breaks.
    breaks = quantile(ttrans$sales_val)
  )
# Plot each household's income bracket against its age bracket. Points are
# drawn nearly transparent so that age/income combinations shared by many
# households show up darker than sparsely populated ones.
ggplot(demographics, mapping = aes(x = age, y = income)) +
  geom_point(alpha = 0.1) +
  xlab("Age") +
  ylab("Income") +
  ggtitle("Income by Age")
# Histogram of individual transaction sales values, one bin per unit of
# `sales_value` (binwidth = 1).
ggplot(transactions, aes(x = sales_value)) +
  geom_histogram(binwidth = 1) +
  # Title fix: the original string was missing its closing parenthesis.
  ggtitle("Amount of different sales values in the Transactions Data (Bin Width = 1)")
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.