For this project, I am investigating if beer is often bought at the same time as pizza Regork should be interested in answering this question so that they can keep an ample supply of both beer and pizza so they can be bought together
library(tidyverse) # Streamlines data analysis for easier manipulation, visualization, etc.
library(completejourney) # Provides information about the Regork company for analysis
library(dplyr) # Ensures dplyr and pipe operators are available
library(ggplot2) # Data visualization
suppressPackageStartupMessages({ library(tidyverse) library(ggplot2) library(completejourney) library(dplyr) })
#DATA
data <- completejourney::get_transactions()
products <- completejourney::products
# Check if products data loaded
if(exists("products") && is.data.frame(products)) {
message("products dataset successfully loaded")
message("Number of rows in products: ", nrow(products))
} else {
stop("products dataset not found")
}
## products dataset successfully loaded
## Number of rows in products: 92331
# Check column names in products to find the correct field name
message("Columns in products: ", paste(colnames(products), collapse=", "))
## Columns in products: product_id, manufacturer_id, department, brand, product_category, product_type, package_size
# Peek at the first few rows of products
dplyr::glimpse(products)
## Rows: 92,331
## Columns: 7
## $ product_id <chr> "25671", "26081", "26093", "26190", "26355", "26426",…
## $ manufacturer_id <chr> "2", "2", "69", "69", "69", "69", "69", "69", "69", "…
## $ department <chr> "GROCERY", "MISCELLANEOUS", "PASTRY", "GROCERY", "GRO…
## $ brand <fct> National, National, Private, Private, Private, Privat…
## $ product_category <chr> "FRZN ICE", NA, "BREAD", "FRUIT - SHELF STABLE", "COO…
## $ product_type <chr> "ICE - CRUSHED/CUBED", NA, "BREAD:ITALIAN/FRENCH", "A…
## $ package_size <chr> "22 LB", NA, NA, "50 OZ", "14 OZ", "2.5 OZ", "16 OZ",…
# Confirm product_id alignment
if("product_id" %in% colnames(products)) {
message("product_id column found in products")
} else {
stop("product_id column not found in products")
}
## product_id column found in products
# Confirm product_category exists in products
test_product_category <- "product_category" %in% colnames(products)
message("product_category column in products: ", test_product_category)
## product_category column in products: TRUE
# Join data and products
if("product_id" %in% colnames(data) && "product_id" %in% colnames(products)) {
data <- dplyr::left_join(data, products, by = "product_id")
} else {
stop("product_id column missing in one of the datasets")
}
# Confirm the joined data structure
message("Columns in joined data: ", paste(colnames(data), collapse=", "))
## Columns in joined data: household_id, store_id, basket_id, product_id, quantity, sales_value, retail_disc, coupon_disc, coupon_match_disc, week, transaction_timestamp, manufacturer_id, department, brand, product_category, product_type, package_size
dplyr::glimpse(data)
## Rows: 1,469,307
## Columns: 17
## $ household_id <chr> "900", "900", "1228", "906", "906", "906", "906"…
## $ store_id <chr> "330", "330", "406", "319", "319", "319", "319",…
## $ basket_id <chr> "31198570044", "31198570047", "31198655051", "31…
## $ product_id <chr> "1095275", "9878513", "1041453", "1020156", "105…
## $ quantity <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ sales_value <dbl> 0.50, 0.99, 1.43, 1.50, 2.78, 5.49, 1.50, 1.88, …
## $ retail_disc <dbl> 0.00, 0.10, 0.15, 0.29, 0.80, 0.50, 0.29, 0.21, …
## $ coupon_disc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coupon_match_disc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ week <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ transaction_timestamp <dttm> 2017-01-01 06:53:26, 2017-01-01 07:10:28, 2017-…
## $ manufacturer_id <chr> "2", "69", "69", "2142", "2326", "608", "2326", …
## $ department <chr> "PASTRY", "GROCERY", "GROCERY", "GROCERY", "GROC…
## $ brand <fct> National, Private, Private, National, National, …
## $ product_category <chr> "ROLLS", "FACIAL TISS/DNR NAPKIN", "BAG SNACKS",…
## $ product_type <chr> "ROLLS: BAGELS", "FACIAL TISSUE & PAPER HANDKE",…
## $ package_size <chr> "4 OZ", "85 CT", "11.5 OZ", "17.1 OZ", "5.0 OZ",…
# Check unique product categories if available
if("product_category" %in% colnames(data)) {
unique_categories <- unique(data$product_category)
unique_categories <- unique_categories[!is.na(unique_categories)] # Remove NA values
message("Unique product categories: ", paste(unique_categories, collapse=", "))
# Check for presence of FROZEN PIZZA and BEERS/ALES products
message("Checking for FROZEN PIZZA and BEERS/ALES products...")
pizza_count <- sum(data$product_category == "FROZEN PIZZA", na.rm = TRUE)
beer_count <- sum(data$product_category == "BEERS/ALES", na.rm = TRUE)
message("Number of FROZEN PIZZA products: ", pizza_count)
message("Number of BEERS/ALES products: ", beer_count)
} else {
message("product_category not found in data")
}
## Unique product categories: ROLLS, FACIAL TISS/DNR NAPKIN, BAG SNACKS, REFRGRATD DOUGH PRODUCTS, SEAFOOD - SHELF STABLE, FRZN MEAT/MEAT DINNERS, SOFT DRINKS, CRACKERS/MISC BKD FD, FRZN BREAKFAST FOODS, REFRGRATD JUICES/DRNKS, BAKED SWEET GOODS, FLUID MILK PRODUCTS, POTATOES, TROPICAL FRUIT, NEWSPAPER, VITAMINS, CARROTS, ONIONS, HISPANIC, CHEESE, FRZN JCE CONC/DRNKS, EGGS, BREAKFAST SWEETS, CANNED JUICES, PAPER TOWELS, PICKLE/RELISH/PKLD VEG, COUPON/MISC ITEMS, FEMININE HYGIENE, CIGARETTES, BREAD, HERBS, BEANS - CANNED GLASS & MW, COFFEE, BAKING MIXES, FRUIT - SHELF STABLE, ICE CREAM/MILK/SHERBTS, BEEF, BAKED BREAD/BUNS/ROLLS, DINNER SAUSAGE, FRZN POTATOES, CHEESES, TOMATOES, MARGARINES, COLD CEREAL, LUNCHMEAT, MILK BY-PRODUCTS, AIR CARE, CANDY - PACKAGED, VEGETABLES - ALL OTHERS, PORK, REFRIGERATED, FROZEN, CONDIMENTS/SAUCES, MEAT - SHELF STABLE, CHICKEN, FOOT CARE PRODUCTS, LAXATIVES, DELI MEATS, DRY NOODLES/PASTA, SALD DRSNG/SNDWCH SPRD, SWEET GOODS & SNACKS, HOT DOGS, BREAKFAST SAUSAGE/SANDWICHES, SMOKED MEATS, HOUSEHOLD CLEANG NEEDS, APPAREL, BUTTER, FROZEN PIE/DESSERTS, DRY BN/VEG/POTATO/RICE, PROCESSED, WATER - CARBONATED/FLVRD DRINK, SEAFOOD - FROZEN, COLD AND FLU, FIRST AID PRODUCTS, YOGURT, EYE AND EAR CARE PRODUCTS, COOKIES/CONES, HAIR CARE PRODUCTS, ANTACIDS, DINNER MXS:DRY, PWDR/CRYSTL DRNK MX, MEAT - MISC, SHORTENING/OIL, CITRUS, SUGARS/SWEETNERS, FRZN VEGETABLE/VEG DSH, DOG FOODS, APPLES, BACON, SOAP - LIQUID & BAR, ANALGESICS, KITCHEN GADGETS, SALAD MIX, CEREAL/BREAKFAST, HOT CEREAL, FROZEN BREAD/DOUGH, FROZEN PIZZA, VEGETABLES - SHELF STABLE, DIAPERS & DISPOSABLES, FROZEN MEAT, DRY MIX DESSERTS, MAKEUP AND TREATMENT, CONVENIENT BRKFST/WHLSM SNACKS, SOUP, BAKING NEEDS, SHAVING CARE PRODUCTS, BATH TISSUES, SPICES & EXTRACTS, MISCELLANEOUS, FRZN ICE, BEERS/ALES, FD WRAPS/BAGS/TRSH BG, STATIONERY & SCHOOL SUPPLIES, BROCCOLI/CAULIFLOWER, VEGETABLES SALAD, ORGANICS FRUIT & VEGETABLES, BABY HBC, MUSHROOMS, CAT LITTER, DEODORANTS, CAT FOOD, FRZN NOVELTIES/WTR ICE, HAIR CARE ACCESSORIES, LAUNDRY DETERGENTS, HEAT/SERVE, DRY SAUCES/GRAVY, COOKIES, CHRISTMAS SEASONAL, BROOMS AND MOPS, SALAD BAR, PARTY TRAYS, CHICKEN/POULTRY, SINUS AND ALLERGY, PAPER HOUSEWARES, FROZEN - BOXED(GROCERY), TEAS, PLASTIC HOUSEWARES, COFFEE FILTERS, MAGAZINE, EXOTIC GAME/FOWL, DISHWASH DETERGENTS, PREPARED FOOD, CANDY - CHECKLANE, PEPPERS-ALL, SALADS/DIPS, VALUE ADDED FRUIT, VALUE ADDED VEGETABLES, CHIPS&SNACKS, LAUNDRY ADDITIVES, GREETING CARDS/WRAP/PARTY SPLY, COUPONS/STORE & MFG, NEW AGE, ORAL HYGIENE PRODUCTS, JUICE, PROPANE, DRY TEA/COFFEE/COCO MIX, MISC. DAIRY, MOLASSES/SYRUP/PANCAKE MIXS, WAREHOUSE SNACKS, BERRIES, FLOUR & MEALS, TURKEY, WATER, BABY FOODS, INFANT FORMULA, COCOA MIXES, CANDLES/ACCESSORIES, PET CARE SUPPLIES, DOMESTIC GOODS, PNT BTR/JELLY/JAMS, DOMESTIC WINE, MISC WINE, INFANT CARE PRODUCTS, SEAFOOD - MISC, ISOTONIC DRINKS, STONE FRUIT, PASTA SAUCE, BOOKSTORE, OLIVES, DELI SPECIALTIES (RETAIL PK), PIES, SANDWICHES, DIETARY AID PRODUCTS, DISPOSIBLE FOILWARE, CANNED MILK, NUTS, HAND/BODY/FACIAL PRODUCTS, BLEACH, CHARCOAL AND LIGHTER FLUID, GRAPES, COOKWARE & BAKEWARE, ELECTRICAL SUPPPLIES, ETHNIC PERSONAL CARE, PEARS, TOYS AND GAMES, MELONS, SNACKS, SEWING, NON-DAIRY BEVERAGES, DRIED FRUIT, SQUASH, FILM AND CAMERA PRODUCTS, IN-STORE PHOTOFINISHING, SNACK NUTS, POPCORN, CONDIMENTS, SEAFOOD-FRESH, FITNESS&DIET, DELI SUPPLIES, FUEL, ROSES, DOLLAR VALUE PRODUCTS, CAKES, RICE CAKES, PORTABLE ELECTRIC APPLIANCES, IMPORTED WINE, SYRUPS/TOPPINGS, SHOE CARE, LIQUOR, PREPAID WIRELESS&ACCESSORIES, AUDIO/VIDEO PRODUCTS, FAMILY PLANNING, BATTERIES, FRZN FRUITS, SNKS/CKYS/CRKR/CNDY, TOBACCO OTHER, AUTOMOTIVE PRODUCTS, HOSIERY/SOCKS, RW FRESH PROCESSED MEAT, TICKETS, FLORAL-FLOWERING PLANTS, COUPON, BATH, ADULT INCONTINENCE, SUNTAN, FLORAL-FRESH CUT, IRONING AND CHEMICALS, NON EDIBLE PRODUCTS, BAKERY PARTY TRAYS, SUSHI, PREPARED/PKGD FOODS, HALLOWEEN, SPRING/SUMMER SEASONAL, HARDWARE SUPPLIES, BIRD SEED, BAKING, SPORTS MEMORABLILIA, COSMETIC ACCESSORIES, VEAL, J-HOOKS, INSECTICIDES, FALL AND WINTER SEASONAL, BABYFOOD, PACKAGED NATURAL SNACKS, BEVERAGE, GLASSWARE & DINNERWARE, FRZN SEAFOOD, FROZEN CHICKEN, FLORAL BALLOONS, HOME FREEZING & CANNING SUPPLY, NATURAL HBC, GLASSES/VISION AIDS, BOTTLE DEPOSITS, FLORAL- HARD GOODS, SERVICE BEVERAGE, COFFEE SHOP, FLORAL-FOLIAGE PLANTS, SEASONAL, VALENTINE, PERSONAL CARE APPLIANCES, MEAT SUPPLIES, COFFEE SHOP SWEET GOODS&RETAIL, CIGARS, LAMB, QUICK SERVICE, LONG DISTANCE CALLING CARDS, EASTER, WATCHES/CALCULATORS/LOBBY, FRAGRANCES, OVERNIGHT PHOTOFINISHING, SMOKING CESSATIONS, FLORAL-ACCESSORIES, CONTINUITIES, HOME FURNISHINGS, CORN, NATURAL VITAMINS, GIFT & FRUIT BASKETS, LAWN AND GARDEN SHOP, RESTRICTED DIET, MISCELLANEOUS HBC, GARDEN CENTER, PROD SUPPLIES, PKG.SEAFOOD MISC, HOME HEALTH CARE, TOYS, PHARMACY, EASTER LILY, FIREWORKS, MISCELLANEOUS CROUTONS, BULK FOODS, BOUQUET (NON ROSE), NDAIRY/TEAS/JUICE/SOD
## Checking for FROZEN PIZZA and BEERS/ALES products...
## Number of FROZEN PIZZA products: 23998
## Number of BEERS/ALES products: 9943
# Check structure of data after join
message("Structure of data after join:")
## Structure of data after join:
str(data)
## tibble [1,469,307 × 17] (S3: tbl_df/tbl/data.frame)
## $ household_id : chr [1:1469307] "900" "900" "1228" "906" ...
## $ store_id : chr [1:1469307] "330" "330" "406" "319" ...
## $ basket_id : chr [1:1469307] "31198570044" "31198570047" "31198655051" "31198705046" ...
## $ product_id : chr [1:1469307] "1095275" "9878513" "1041453" "1020156" ...
## $ quantity : num [1:1469307] 1 1 1 1 2 1 1 1 1 1 ...
## $ sales_value : num [1:1469307] 0.5 0.99 1.43 1.5 2.78 5.49 1.5 1.88 1.5 2.69 ...
## $ retail_disc : num [1:1469307] 0 0.1 0.15 0.29 0.8 0.5 0.29 0.21 1.29 0 ...
## $ coupon_disc : num [1:1469307] 0 0 0 0 0 0 0 0 0 0 ...
## $ coupon_match_disc : num [1:1469307] 0 0 0 0 0 0 0 0 0 0 ...
## $ week : int [1:1469307] 1 1 1 1 1 1 1 1 1 1 ...
## $ transaction_timestamp: POSIXct[1:1469307], format: "2017-01-01 06:53:26" "2017-01-01 07:10:28" ...
## $ manufacturer_id : chr [1:1469307] "2" "69" "69" "2142" ...
## $ department : chr [1:1469307] "PASTRY" "GROCERY" "GROCERY" "GROCERY" ...
## $ brand : Factor w/ 2 levels "National","Private": 1 2 2 1 1 1 1 2 1 1 ...
## $ product_category : chr [1:1469307] "ROLLS" "FACIAL TISS/DNR NAPKIN" "BAG SNACKS" "REFRGRATD DOUGH PRODUCTS" ...
## $ product_type : chr [1:1469307] "ROLLS: BAGELS" "FACIAL TISSUE & PAPER HANDKE" "POTATO CHIPS" "REFRIGERATED BAGELS" ...
## $ package_size : chr [1:1469307] "4 OZ" "85 CT" "11.5 OZ" "17.1 OZ" ...
# Make sure product_category column is populated
if("product_category" %in% colnames(data)) {
message("Non-NA product_category values: ", sum(!is.na(data$product_category)))
}
## Non-NA product_category values: 1462262
# Make sure product_category exists after the join
available_columns <- colnames(data)
message("Available columns: ", paste(available_columns, collapse=", "))
## Available columns: household_id, store_id, basket_id, product_id, quantity, sales_value, retail_disc, coupon_disc, coupon_match_disc, week, transaction_timestamp, manufacturer_id, department, brand, product_category, product_type, package_size
if("product_category" %in% available_columns) {
pizza_beer <- data |>
dplyr::filter(product_category %in% c("FROZEN PIZZA", "BEERS/ALES"))
message("Rows in pizza_beer: ", nrow(pizza_beer))
dplyr::glimpse(pizza_beer)
# Check product categories in pizza_beer
if(nrow(pizza_beer) > 0) {
message("Unique product categories in pizza_beer: ", paste(unique(pizza_beer$product_category), collapse=", "))
} else {
message("pizza_beer dataset is empty")
}
} else {
stop("product_category not found in data. Check column names above.")
}
## Rows in pizza_beer: 33941
## Rows: 33,941
## Columns: 17
## $ household_id <chr> "1899", "1899", "434", "2004", "1765", "1445", "…
## $ store_id <chr> "31782", "31782", "381", "31782", "346", "410", …
## $ basket_id <chr> "31198500220", "31198500220", "31198676474", "31…
## $ product_id <chr> "1043494", "1081470", "978367", "890695", "94185…
## $ quantity <dbl> 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, …
## $ sales_value <dbl> 3.49, 2.00, 2.97, 2.00, 3.49, 15.99, 2.99, 9.49,…
## $ retail_disc <dbl> 0.30, 1.79, 0.00, 0.69, 0.30, 0.00, 0.00, 0.00, …
## $ coupon_disc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ coupon_match_disc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ week <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ transaction_timestamp <dttm> 2017-01-01 10:48:12, 2017-01-01 10:48:12, 2017-…
## $ manufacturer_id <chr> "1039", "608", "697", "988", "1039", "697", "151…
## $ department <chr> "GROCERY", "GROCERY", "GROCERY", "GROCERY", "GRO…
## $ brand <fct> National, National, National, National, National…
## $ product_category <chr> "FROZEN PIZZA", "FROZEN PIZZA", "BEERS/ALES", "F…
## $ product_type <chr> "SNACKS/APPETIZERS", "SNACKS/APPETIZERS", "BEERA…
## $ package_size <chr> NA, "13.5 OZ", "24 OZ", "13 OZ", NA, "12 OZ", "1…
## Unique product categories in pizza_beer: FROZEN PIZZA, BEERS/ALES
# Make sure pizza_beer dataset is not empty
if(exists("pizza_beer") && nrow(pizza_beer) > 0) {
message("pizza_beer dataset created successfully with ", nrow(pizza_beer), " rows")
} else {
stop("pizza_beer dataset not created or empty")
}
## pizza_beer dataset created successfully with 33941 rows
if(exists("pizza_beer") && nrow(pizza_beer) > 0) {
pizza_beer_summary <- pizza_beer |>
dplyr::group_by(basket_id) |>
dplyr::summarize(pizza = any(product_category == "FROZEN PIZZA"),
beer = any(product_category == "BEERS/ALES"))
co_purchase <- pizza_beer_summary |>
dplyr::filter(pizza == TRUE & beer == TRUE)
message("Number of co-purchase baskets: ", nrow(co_purchase))
# Print the contents of pizza_beer_summary
message("Summary of pizza_beer_summary:")
print(table(pizza_beer_summary$pizza, pizza_beer_summary$beer))
} else {
message("pizza_beer dataset not available or empty")
}
## Number of co-purchase baskets: 1023
## Summary of pizza_beer_summary:
##
## FALSE TRUE
## FALSE 0 7561
## TRUE 11884 1023
# Makse sure pizza_beer_summary dataset is created
if(exists("pizza_beer_summary") && nrow(pizza_beer_summary) > 0) {
message("pizza_beer_summary dataset created successfully with ", nrow(pizza_beer_summary), " rows")
} else {
stop("pizza_beer_summary dataset not created or empty")
}
## pizza_beer_summary dataset created successfully with 20468 rows
if(exists("pizza_beer_summary") && nrow(pizza_beer_summary) > 0) {
co_purchase_plot <- pizza_beer_summary |>
dplyr::count(pizza, beer) |>
ggplot2::ggplot(ggplot2::aes(x = pizza, y = beer, fill = n)) +
ggplot2::geom_tile() +
ggplot2::labs(title = "Co-Purchase of Pizza and Beer",
x = "Pizza Purchased",
y = "Beer Purchased",
fill = "Count")
co_purchase_plot
} else {
message("pizza_beer_summary dataset not available or empty")
}