How does the package size of a product influence the total sales value
of its transactions?
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Build the analysis table in two steps: attach household demographics to
# the sampled transactions, then attach product attributes. Both joins are
# inner joins, so rows without a match in either lookup table are dropped.
# `by` is left implicit; dplyr joins on the columns the tables share.
newexample <- inner_join(transactions_sample, demographics)
newexample <- inner_join(newexample, products)
## Joining with `by = join_by(household_id)`
## Joining with `by = join_by(product_id)`
# Print a compact column-by-column overview of the joined table.
newexample %>%
  glimpse()
## Rows: 42,058
## Columns: 24
## $ household_id <chr> "400", "718", "868", "1694", "2154", "1631", "21…
## $ store_id <chr> "388", "324", "323", "446", "343", "293", "32004…
## $ basket_id <chr> "31932241118", "32614612029", "32074722463", "40…
## $ product_id <chr> "13094913", "883203", "9884484", "989069", "1064…
## $ quantity <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 0, 2, 1, …
## $ sales_value <dbl> 11.87, 2.50, 3.49, 2.50, 1.50, 1.88, 1.00, 1.00,…
## $ retail_disc <dbl> 2.90, 0.49, 0.00, 0.49, 0.89, 0.12, 0.39, 0.00, …
## $ coupon_disc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coupon_match_disc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ week <int> 8, 15, 10, 47, 18, 37, 40, 33, 35, 43, 41, 27, 2…
## $ transaction_timestamp <dttm> 2017-02-18 13:13:10, 2017-04-05 18:14:17, 2017-…
## $ age <ord> 35-44, 45-54, 65+, 35-44, 45-54, 19-24, 55-64, 1…
## $ income <ord> 150-174K, 25-34K, 35-49K, 15-24K, 35-49K, Under …
## $ home_ownership <ord> Homeowner, Homeowner, Homeowner, Probable Renter…
## $ marital_status <ord> Married, Married, Married, Unmarried, Married, U…
## $ household_size <ord> 3, 5+, 2, 1, 2, 4, 1, 1, 2, 2, 1, 4, 2, 1, 4, 2,…
## $ household_comp <ord> 2 Adults Kids, 2 Adults Kids, 2 Adults No Kids, …
## $ kids_count <ord> 1, 3+, 0, 0, 0, 3+, 0, 0, 0, 1, 0, 2, 1, 0, 2, 0…
## $ manufacturer_id <chr> "4421", "5072", "1102", "69", "972", "69", "69",…
## $ department <chr> "MEAT", "DRUG GM", "GROCERY", "MEAT-PCKGD", "GRO…
## $ brand <fct> National, National, National, Private, National,…
## $ product_category <chr> "BEEF", "FIRST AID PRODUCTS", "BAKED SWEET GOODS…
## $ product_type <chr> "ANGUS BEEF", "BANDAGE/TAPE", "SNACK CAKE - MULT…
## $ package_size <chr> NA, NA, "8 OZ", "12 OZ", "10 OZ", "12 PK", "16 O…
# Scatter plot of sales value per transaction against package size.
ggplot(
  data = newexample,
  mapping = aes(x = package_size, y = sales_value)
) +
  geom_point()

# Scatter plot of sales value against package size, coloured by household
# size, with a linear fit (no confidence band) overlaid.
#
# "Set1" is a ColorBrewer palette name, not a colour: it must be given to
# scale_color_brewer(palette = ...) rather than passed to geom_smooth() as a
# fixed `color`, where it is rejected as an unknown colour when the plot is
# drawn. Without the fixed colour the fit lines inherit the household_size
# colour mapping from the plot.
#
# NOTE(review): package_size is a character column, so the x axis is
# discrete and ggplot2 groups the fit by every discrete variable in the
# mapping; confirm a per-group `lm` is what is wanted here.
newexample %>%
  ggplot(aes(x = package_size, y = sales_value, color = household_size)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Set1")
## `geom_smooth()` using formula = 'y ~ x'

# Density of per-transaction sales value by department, restricted to
# transactions under 20 in sales_value. The very low alpha leaves the fill
# almost fully transparent, so the curves read mainly by their outlines.
ggplot(
  data = filter(newexample, sales_value < 20),
  mapping = aes(x = sales_value, color = department, fill = department)
) +
  geom_density(alpha = .0004)
