library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Day 1

cus <- read_csv("noahs-customers.csv", show_col_types = F)

cus |> 
  mutate(
    # Convert the name's letters to numbers
    name_digits = str_replace_all(
      tolower(name),
      c("[abc]" = "2",
        "[def]" = "3",
        "[ghi]" = "4",
        "[jkl]" = "5",
        "[mno]" = "6",
        "[pqrs]" = "7",
        "[tuv]" = "8",
        "[wxyz]" = "9")),
    # Clean phone number
    phone = str_remove_all(phone, "-")
  ) |> 
  # Cases where the phone number appears in the name digits
  filter(str_detect(name_digits, phone)) |> 
  select(name, phone)

## # A tibble: 1 × 2
##   name           phone     
##   <chr>          <chr>     
## 1 Sam Guttenberg 4888362374

Day 2

# Join products, orders, items, and customers
pro <- read_csv("noahs-products.csv", show_col_types = F)
ord <- read_csv("noahs-orders.csv", show_col_types = F)
ite <- read_csv("noahs-orders_items.csv", show_col_types = F)

dat <- ord |> 
  left_join(ite, by = "orderid") |> 
  left_join(cus, by = "customerid") |> 
  left_join(pro, by = "sku")

# Find the rug cleaner's SKU in the products data
pro |> filter(desc |> tolower() |> str_detect("rug"))

## # A tibble: 3 × 3
##   sku     desc               wholesale_cost
##   <chr>   <chr>                       <dbl>
## 1 BKY0455 Raspberry Rugelach           5.24
## 2 BKY6881 Poppyseed Rugelach           5.35
## 3 HOM8601 Rug Cleaner                  3.51

# HOM8601 (Rug Cleaner)

# Find customers with initials JD who invoiced Nate for rug cleaner in 2017
dat |> 
  filter(
    year(ordered) == 2017,
    str_detect(name, "J.* D"),
    sku == "HOM8601"
  ) |> 
  select(ordered, desc, name, phone)

## # A tibble: 6 × 4
##   ordered             desc        name             phone       
##   <dttm>              <chr>       <chr>            <chr>       
## 1 2017-04-05 12:49:41 Rug Cleaner Jeremy Davis     212-771-8924
## 2 2017-04-08 14:23:37 Rug Cleaner Jeremy Davis     212-771-8924
## 3 2017-04-11 13:11:40 Rug Cleaner Jeremy Davis     212-771-8924
## 4 2017-04-14 12:59:36 Rug Cleaner Jeremy Davis     212-771-8924
## 5 2017-04-17 14:39:59 Rug Cleaner Jeremy Davis     212-771-8924
## 6 2017-06-06 18:00:51 Rug Cleaner Jennifer Daniels 516-994-3023

Day 3

# Contractor's neighborhood? 
cus |> filter(name == "Jeremy Davis") |> select(name, address, citystatezip)

## # A tibble: 1 × 3
##   name         address          citystatezip              
##   <chr>        <chr>            <chr>                     
## 1 Jeremy Davis 134-10 Foch Blvd South Ozone Park, NY 11420

# South Ozone Park, NY

# Year of the dog
dog_year <- seq(2030, 1930, by = -12)

cus |> 
  filter(
    # Born year of the dog
    year(birthdate) %in% dog_year,
    # During Aries
    (month(birthdate) == 3 & day(birthdate) >= 21) |
      (month(birthdate) == 4 & day(birthdate) <= 19),
    # In the neighborhood?
    str_detect(citystatezip, "Ozone")
  ) |> 
  select(name, birthdate, citystatezip, phone)

## # A tibble: 1 × 4
##   name         birthdate  citystatezip               phone       
##   <chr>        <date>     <chr>                      <chr>       
## 1 Brent Nguyen 1958-03-25 South Ozone Park, NY 11420 516-636-7397

Day 4

# Find pastries - there was rugelach before!
pro |> filter(str_detect(tolower(desc), "rugelach"))

## # A tibble: 2 × 3
##   sku     desc               wholesale_cost
##   <chr>   <chr>                       <dbl>
## 1 BKY0455 Raspberry Rugelach           5.24
## 2 BKY6881 Poppyseed Rugelach           5.35

rugelach_sku <- c("BKY0455", "BKY6881")
dat |> 
  filter(
    # Got rugelach
    sku %in% rugelach_sku,
    # before 5 a.m.
    hour(ordered) < 5,
    # more than one!
    qty > 1
  ) |> 
  select(ordered, desc, qty, name, phone)

## # A tibble: 1 × 5
##   ordered             desc                 qty name             phone       
##   <dttm>              <chr>              <dbl> <chr>            <chr>       
## 1 2018-05-16 04:25:13 Raspberry Rugelach     2 Christina Booker 718-649-9036

Day 5

# What items are cat related?
cat_skus <- pro$sku[str_detect(tolower(pro$desc), "cat")]
# Who from Queens Village bought a lot of cat items?
dat |> 
  filter(
    sku %in% cat_skus,
    str_detect(tolower(citystatezip), "queens village"),
    year(ordered) >= 2018
  ) |> 
  count(name, phone, sort = T)

## # A tibble: 4 × 3
##   name           phone            n
##   <chr>          <chr>        <int>
## 1 Anita Koch     315-492-7411    13
## 2 Sam House      607-836-2966     3
## 3 Jose Cook      516-745-9461     1
## 4 Justin Cochran 716-525-5173     1

Day 6

# Get sales - items sold for under wholesale price
dat |> 
  filter(unit_price < wholesale_cost) |> 
  # Who bought the most things on sale?
  count(name, phone, sort = T)

## # A tibble: 1,619 × 3
##    name              phone            n
##    <chr>             <chr>        <int>
##  1 Emily Randolph    914-868-0316    26
##  2 Travis Bartlett   929-906-5980     8
##  3 Jonathan Williams 332-668-0158     6
##  4 Joshua Hansen     680-511-5677     6
##  5 Amanda Neal       716-966-2113     5
##  6 Debra Baldwin     607-211-2182     5
##  7 Dustin Wallace    631-505-6074     5
##  8 Morgan Smith      585-689-9118     5
##  9 Nicole Brooks     516-247-6087     5
## 10 Rebecca Johnson   516-254-0024     5
## # … with 1,609 more rows

Day 7

# Emily Randolph and some guy got the same item in different colors,
# around the same time.

pro |> filter(str_detect(desc, "red"))

## # A tibble: 14 × 3
##    sku     desc                       wholesale_cost
##    <chr>   <chr>                               <dbl>
##  1 HOM0018 Power Radio (red)                   21.8 
##  2 COL0295 Noah's Jersey (red)                 15.4 
##  3 HOM0835 Power Bicycle (red)                 87.9 
##  4 HOM0892 Automatic Widget (red)               5.83
##  5 HOM1296 Electric Toaster (red)              19.4 
##  6 HOM2308 Manual Radio (red)                  23.2 
##  7 COL5914 Noah's Lunchbox (red)               19.4 
##  8 HOM6145 Handmade Mixer (red)               218.  
##  9 COL6628 Noah's Gift Box (red)                4.81
## 10 COL6708 Noah's Action Figure (red)          27.5 
## 11 COL7794 Noah's Poster (red)                  7.12
## 12 COL8456 Noah's Bobblehead (red)              8.48
## 13 HOM8473 Automatic Bicycle (red)             79.3 
## 14 HOM8793 Automatic Machine (red)             38.6

# Colors are specified in ( )

dat |> 
  # Define day of purchace for grouping
  mutate(date = as.Date(ordered)) |>
  # Split the type of something off of the thing itself
  separate(
    desc, 
    into = c("item_name", "item_type"),
    sep = " \\(",
    fill = "right"
  ) |> 
  # Optional: clean the type
  mutate(item_type = str_remove(item_type, "\\)$")) |> 
  # Restrict to items that have a type
  filter(!is.na(item_type)) |> 
  # Restrict to day-item combinations with Emily Randolph and someone else
  group_by(date, item_name) |>
  filter(any(name == "Emily Randolph"), length(unique(name)) > 1 ) |> 
  # Display results
  ungroup() |> 
  select(ordered, name, phone, item_name, item_type)

## # A tibble: 11 × 5
##    ordered             name                 phone        item_name       item_…¹
##    <dttm>              <chr>                <chr>        <chr>           <chr>  
##  1 2019-06-01 12:50:16 Emily Randolph       914-868-0316 Electric Machi… amber  
##  2 2019-06-01 12:50:25 Jonathan Adams       315-618-5263 Electric Machi… purple 
##  3 2019-08-18 12:47:14 Melvin Rodriguez III 914-698-1257 Electric Toast… mauve  
##  4 2019-08-18 14:20:28 Emily Randolph       914-868-0316 Electric Toast… blue   
##  5 2019-08-18 21:22:58 Dana Wright          838-284-6715 Electric Toast… blue   
##  6 2021-04-17 11:14:30 Timothy Jones        716-691-6727 Noah's Jewelry  green  
##  7 2021-04-17 12:22:55 Frank Roach          516-520-9716 Noah's Jewelry  yellow 
##  8 2021-04-17 15:54:57 James Anderson       516-689-9580 Noah's Jewelry  blue   
##  9 2021-04-17 19:52:28 Emily Randolph       914-868-0316 Noah's Jewelry  yellow 
## 10 2022-07-26 08:18:05 Emily Randolph       914-868-0316 Noah's Jewelry  orange 
## 11 2022-07-26 12:05:26 Beth Mack            315-444-9913 Noah's Jewelry  azure  
## # … with abbreviated variable name ¹item_type

# Only purchase at the same time is Emily Randolph + Jonathan Adams

Day 8

# Has a full set of Noah's collectibles
dat |> 
  group_by(name, phone) |> 
  summarize(
    n_collectible = sum(str_detect(desc, "^Noah's")), 
    .groups = "drop"
  ) |> 
  arrange(desc(n_collectible))

## # A tibble: 6,268 × 3
##    name                phone        n_collectible
##    <chr>               <chr>                <int>
##  1 Travis Bartlett     929-906-5980           230
##  2 Joshua Smith        914-243-3254            35
##  3 Christopher Bennett 516-686-9853            34
##  4 Megan Poole         585-479-7947            34
##  5 Justin Jimenez      516-722-4758            33
##  6 Chelsea Marsh       315-565-2714            30
##  7 Crystal Clark       315-296-8034            30
##  8 Joshua Parks        680-594-9693            30
##  9 Natasha Contreras   607-869-4191            30
## 10 Brendan Roberts     516-277-1533            29
## # … with 6,258 more rows

Hanukkah of Data

2022-12-25

Day 1

Day 2

Day 3

Day 4

Day 5

Day 6

Day 7

Day 8