library(tidyverse)
library(completejourney)
# Print the search path to confirm which packages are attached after the
# library() calls above.
search()
##  [1] ".GlobalEnv"              "package:completejourney"
##  [3] "package:lubridate"       "package:forcats"        
##  [5] "package:stringr"         "package:dplyr"          
##  [7] "package:purrr"           "package:readr"          
##  [9] "package:tidyr"           "package:tibble"         
## [11] "package:ggplot2"         "package:tidyverse"      
## [13] "package:stats"           "package:graphics"       
## [15] "package:grDevices"       "package:utils"          
## [17] "package:datasets"        "package:methods"        
## [19] "Autoloads"               "package:base"
# List every object exported by the attached completejourney package.
ls(name = "package:completejourney")
##  [1] "%<-%"                  "%>%"                   "campaign_descriptions"
##  [4] "campaigns"             "coupon_redemptions"    "coupons"              
##  [7] "demographics"          "get_data"              "get_promotions"       
## [10] "get_transactions"      "products"              "promotions_sample"    
## [13] "transactions_sample"
# Retrieve the transactions table through the completejourney helper, then
# preview every column's type and leading values.
transactions <- completejourney::get_transactions()
dplyr::glimpse(transactions)
## Rows: 1,469,307
## Columns: 11
## $ household_id          <chr> "900", "900", "1228", "906", "906", "906", "906"…
## $ store_id              <chr> "330", "330", "406", "319", "319", "319", "319",…
## $ basket_id             <chr> "31198570044", "31198570047", "31198655051", "31…
## $ product_id            <chr> "1095275", "9878513", "1041453", "1020156", "105…
## $ quantity              <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ sales_value           <dbl> 0.50, 0.99, 1.43, 1.50, 2.78, 5.49, 1.50, 1.88, …
## $ retail_disc           <dbl> 0.00, 0.10, 0.15, 0.29, 0.80, 0.50, 0.29, 0.21, …
## $ coupon_disc           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coupon_match_disc     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ week                  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ transaction_timestamp <dttm> 2017-01-01 06:53:26, 2017-01-01 07:10:28, 2017-…
# Row and column counts of the transactions table.
dim(transactions)
## [1] 1469307      11
# Total number of missing cells across the whole table.
transactions %>%
  is.na() %>%
  sum()
## [1] 0
# Missing-value count broken down by column.
transactions %>%
  is.na() %>%
  colSums()
##          household_id              store_id             basket_id 
##                     0                     0                     0 
##            product_id              quantity           sales_value 
##                     0                     0                     0 
##           retail_disc           coupon_disc     coupon_match_disc 
##                     0                     0                     0 
##                  week transaction_timestamp 
##                     0                     0

The transactions dataset contains 1,469,307 rows and 11 columns. It includes information about households, stores, products, quantities purchased, sales values, discounts, and the time of each transaction.

# Min, quartiles, median, mean, and max of the sales_value column.
summary(transactions[["sales_value"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.290   2.000   3.128   3.490 840.000

The sales value is right-skewed. Most transactions have small sales amounts, but a few very large purchases increase the mean above the median.

# Min, quartiles, median, mean, and max of the quantity column.
summary(transactions[["quantity"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     1.0     1.0   104.1     1.0 89638.0

The quantity variable is strongly right-skewed. The median quantity purchased is 1 item, but the mean is much higher because a small number of transactions include very large quantities.

# Keep only the rows that record a quantity greater than one, then check how
# many rows remain.
transactions_multi <- filter(transactions, quantity > 1)

dim(transactions_multi)
## [1] 302909     11

Filtering the dataset to transactions with a quantity greater than 1 leaves 302,909 of the 1,469,307 rows (about 21%), showing that most transaction records have a quantity of at most one.

# Add up the quantity column over every row of the dataset.
summarise(transactions, total_quantity = sum(quantity))
## # A tibble: 1 × 1
##   total_quantity
##            <dbl>
## 1      152923108

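The total quantity purchased across all transactions is 152,923,108 units. This is far larger than the number of transactions (1,469,307) because, as the quantity summary shows, a small number of transactions record extremely large quantities: the mean quantity is 104.1 even though the median is 1.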

# Sum quantity within each household and display the first six households.
# group_by() is kept (rather than `.by`) so the groups stay sorted by
# household_id in the result.
transactions %>%
  group_by(household_id) %>%
  summarise(total_quantity = sum(quantity)) %>%
  head(n = 6)
## # A tibble: 6 × 2
##   household_id total_quantity
##   <chr>                 <dbl>
## 1 1                      1131
## 2 10                        7
## 3 100                  122732
## 4 1000                  30739
## 5 1001                   1088
## 6 1002                    250

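Grouping by household shows that total quantities purchased vary widely across households, indicating different shopping behaviors. Note that household_id is a character column, so the households are listed in lexicographic order (1, 10, 100, 1000, ...) rather than numeric order.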

# Rows from store "309" (store_id is a character column) that also record a
# quantity greater than one; both conditions must hold.
transactions_store309 <- filter(
  transactions,
  store_id == "309" & quantity > 1
)

dim(transactions_store309)
## [1] 1327   11

Filtering to store 309 and transactions with a quantity greater than 1 leaves only 1,327 rows, so multi-unit purchases at this single store make up a small share of the 302,909 multi-unit transactions across all stores.

# Six rows with the largest sales_value. The columns are narrowed first; the
# sort key is among the kept columns, so the sorted result is unchanged.
transactions %>%
  select(household_id, store_id, quantity, sales_value) %>%
  arrange(desc(sales_value)) %>%
  head(n = 6)
## # A tibble: 6 × 4
##   household_id store_id quantity sales_value
##   <chr>        <chr>       <dbl>       <dbl>
## 1 1609         412             3        840 
## 2 346          415             5        632.
## 3 374          361             1        500.
## 4 1109         356             1        330.
## 5 2191         343             1        300.
## 6 1064         365             1        300.

Sorting by sales value shows a small number of transactions with unusually large purchase amounts compared to the majority of transactions.

# Total sales_value per store, ranked from highest to lowest; show the six
# top-ranked stores.
transactions %>%
  group_by(store_id) %>%
  summarise(total_sales = sum(sales_value)) %>%
  arrange(desc(total_sales)) %>%
  head(n = 6)
## # A tibble: 6 × 2
##   store_id total_sales
##   <chr>          <dbl>
## 1 367          148170.
## 2 406          120904.
## 3 429           86884.
## 4 361           81196.
## 5 343           80701.
## 6 356           78874.

Grouping by store reveals that total sales vary substantially across stores: store 367 has the highest total sales (about 148,170), followed by store 406 (about 120,904), while the next four stores each total between roughly 79,000 and 87,000.

# Count the distinct household identifiers present in the transactions.
n_distinct(transactions$household_id)
## [1] 2469

The transactions dataset contains 2,469 unique households.

# Fix the RNG seed so the random draw below is reproducible.
set.seed(123)

# Draw a random 10% of the rows without replacement.
# NOTE(review): completejourney also exports an object named
# `transactions_sample`; this global assignment masks it for the rest of the
# session. Confirm that is intended.
transactions_sample <- sample_frac(transactions, size = 0.10)

dim(transactions_sample)
## [1] 146931     11

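A random 10% sample (146,931 rows) was created to make the dataset more manageable while preserving its overall structure; the seed is fixed with set.seed(123) so the same sample is drawn on every run.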

# Show the first ten rows of the table in their stored order.
slice_head(transactions, n = 10)
## # A tibble: 10 × 11
##    household_id store_id basket_id   product_id quantity sales_value retail_disc
##    <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
##  1 900          330      31198570044 1095275           1        0.5         0   
##  2 900          330      31198570047 9878513           1        0.99        0.1 
##  3 1228         406      31198655051 1041453           1        1.43        0.15
##  4 906          319      31198705046 1020156           1        1.5         0.29
##  5 906          319      31198705046 1053875           2        2.78        0.8 
##  6 906          319      31198705046 1060312           1        5.49        0.5 
##  7 906          319      31198705046 1075313           1        1.5         0.29
##  8 1058         381      31198676055 985893            1        1.88        0.21
##  9 1058         381      31198676055 988791            1        1.5         1.29
## 10 1058         381      31198676055 9297106           1        2.69        0   
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

Slicing the data allows us to inspect specific rows (here, the first 10) to better understand the structure and values in the dataset.

# Top five transactions by sales_value. slice_max() replaces the superseded
# top_n(): it returns the rows sorted from largest to smallest, and
# with_ties = TRUE keeps every row tied at the cut-off, so six rows come back
# because two transactions share the fifth-largest value.
transactions %>%
  slice_max(sales_value, n = 5, with_ties = TRUE) %>%
  select(household_id, store_id, quantity, sales_value)
## # A tibble: 6 × 4
##   household_id store_id quantity sales_value
##   <chr>        <chr>       <dbl>       <dbl>
## 1 1609         412             3        840 
## 2 346          415             5        632.
## 3 374          361             1        500.
## 4 1109         356             1        330.
## 5 2191         343             1        300.
## 6 1064         365             1        300.

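Selecting the top five transactions by sales value returns six rows because two transactions tie for fifth place at a sales value of about 300. These extreme purchase amounts are far above the typical transaction, whose median sales value is 2.00.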