library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Day 1
cus <- read_csv("noahs-customers.csv", show_col_types = F)
cus |>
mutate(
# Convert the name's letters to numbers
name_digits = str_replace_all(
tolower(name),
c("[abc]" = "2",
"[def]" = "3",
"[ghi]" = "4",
"[jkl]" = "5",
"[mno]" = "6",
"[pqrs]" = "7",
"[tuv]" = "8",
"[wxyz]" = "9")),
# Clean phone number
phone = str_remove_all(phone, "-")
) |>
# Cases where the phone number appears in the name digits
filter(str_detect(name_digits, phone)) |>
select(name, phone)
## # A tibble: 1 × 2
## name phone
## <chr> <chr>
## 1 Sam Guttenberg 4888362374
Day 2
# Join products, orders, items, and customers
pro <- read_csv("noahs-products.csv", show_col_types = F)
ord <- read_csv("noahs-orders.csv", show_col_types = F)
ite <- read_csv("noahs-orders_items.csv", show_col_types = F)
dat <- ord |>
left_join(ite, by = "orderid") |>
left_join(cus, by = "customerid") |>
left_join(pro, by = "sku")
# Find the rug cleaner's SKU in the products data
pro |> filter(desc |> tolower() |> str_detect("rug"))
## # A tibble: 3 × 3
## sku desc wholesale_cost
## <chr> <chr> <dbl>
## 1 BKY0455 Raspberry Rugelach 5.24
## 2 BKY6881 Poppyseed Rugelach 5.35
## 3 HOM8601 Rug Cleaner 3.51
# HOM8601 (Rug Cleaner)
# Find customers with initials JD who invoiced Nate for rug cleaner in 2017
dat |>
filter(
year(ordered) == 2017,
str_detect(name, "J.* D"),
sku == "HOM8601"
) |>
select(ordered, desc, name, phone)
## # A tibble: 6 × 4
## ordered desc name phone
## <dttm> <chr> <chr> <chr>
## 1 2017-04-05 12:49:41 Rug Cleaner Jeremy Davis 212-771-8924
## 2 2017-04-08 14:23:37 Rug Cleaner Jeremy Davis 212-771-8924
## 3 2017-04-11 13:11:40 Rug Cleaner Jeremy Davis 212-771-8924
## 4 2017-04-14 12:59:36 Rug Cleaner Jeremy Davis 212-771-8924
## 5 2017-04-17 14:39:59 Rug Cleaner Jeremy Davis 212-771-8924
## 6 2017-06-06 18:00:51 Rug Cleaner Jennifer Daniels 516-994-3023
Day 3
# Contractor's neighborhood?
cus |> filter(name == "Jeremy Davis") |> select(name, address, citystatezip)
## # A tibble: 1 × 3
## name address citystatezip
## <chr> <chr> <chr>
## 1 Jeremy Davis 134-10 Foch Blvd South Ozone Park, NY 11420
# South Ozone Park, NY
# Year of the dog
dog_year <- seq(2030, 1930, by = -12)
cus |>
filter(
# Born year of the dog
year(birthdate) %in% dog_year,
# During Aries
(month(birthdate) == 3 & day(birthdate) >= 21) |
(month(birthdate) == 4 & day(birthdate) <= 19),
# In the neighborhood?
str_detect(citystatezip, "Ozone")
) |>
select(name, birthdate, citystatezip, phone)
## # A tibble: 1 × 4
## name birthdate citystatezip phone
## <chr> <date> <chr> <chr>
## 1 Brent Nguyen 1958-03-25 South Ozone Park, NY 11420 516-636-7397
Day 4
# Find pastries - there was rugelach before!
pro |> filter(str_detect(tolower(desc), "rugelach"))
## # A tibble: 2 × 3
## sku desc wholesale_cost
## <chr> <chr> <dbl>
## 1 BKY0455 Raspberry Rugelach 5.24
## 2 BKY6881 Poppyseed Rugelach 5.35
rugelach_sku <- c("BKY0455", "BKY6881")
dat |>
filter(
# Got rugelach
sku %in% rugelach_sku,
# before 5 a.m.
hour(ordered) < 5,
# more than one!
qty > 1
) |>
select(ordered, desc, qty, name, phone)
## # A tibble: 1 × 5
## ordered desc qty name phone
## <dttm> <chr> <dbl> <chr> <chr>
## 1 2018-05-16 04:25:13 Raspberry Rugelach 2 Christina Booker 718-649-9036
Day 5
# What items are cat related?
cat_skus <- pro$sku[str_detect(tolower(pro$desc), "cat")]
# Who from Queens Village bought a lot of cat items?
dat |>
filter(
sku %in% cat_skus,
str_detect(tolower(citystatezip), "queens village"),
year(ordered) >= 2018
) |>
count(name, phone, sort = T)
## # A tibble: 4 × 3
## name phone n
## <chr> <chr> <int>
## 1 Anita Koch 315-492-7411 13
## 2 Sam House 607-836-2966 3
## 3 Jose Cook 516-745-9461 1
## 4 Justin Cochran 716-525-5173 1
Day 6
# Get sales - items sold for under wholesale price
dat |>
filter(unit_price < wholesale_cost) |>
# Who bought the most things on sale?
count(name, phone, sort = T)
## # A tibble: 1,619 × 3
## name phone n
## <chr> <chr> <int>
## 1 Emily Randolph 914-868-0316 26
## 2 Travis Bartlett 929-906-5980 8
## 3 Jonathan Williams 332-668-0158 6
## 4 Joshua Hansen 680-511-5677 6
## 5 Amanda Neal 716-966-2113 5
## 6 Debra Baldwin 607-211-2182 5
## 7 Dustin Wallace 631-505-6074 5
## 8 Morgan Smith 585-689-9118 5
## 9 Nicole Brooks 516-247-6087 5
## 10 Rebecca Johnson 516-254-0024 5
## # … with 1,609 more rows
Day 7
# Emily Randolph and some guy got the same item in different colors,
# around the same time.
pro |> filter(str_detect(desc, "red"))
## # A tibble: 14 × 3
## sku desc wholesale_cost
## <chr> <chr> <dbl>
## 1 HOM0018 Power Radio (red) 21.8
## 2 COL0295 Noah's Jersey (red) 15.4
## 3 HOM0835 Power Bicycle (red) 87.9
## 4 HOM0892 Automatic Widget (red) 5.83
## 5 HOM1296 Electric Toaster (red) 19.4
## 6 HOM2308 Manual Radio (red) 23.2
## 7 COL5914 Noah's Lunchbox (red) 19.4
## 8 HOM6145 Handmade Mixer (red) 218.
## 9 COL6628 Noah's Gift Box (red) 4.81
## 10 COL6708 Noah's Action Figure (red) 27.5
## 11 COL7794 Noah's Poster (red) 7.12
## 12 COL8456 Noah's Bobblehead (red) 8.48
## 13 HOM8473 Automatic Bicycle (red) 79.3
## 14 HOM8793 Automatic Machine (red) 38.6
# Colors are specified in ( )
dat |>
# Define day of purchace for grouping
mutate(date = as.Date(ordered)) |>
# Split the type of something off of the thing itself
separate(
desc,
into = c("item_name", "item_type"),
sep = " \\(",
fill = "right"
) |>
# Optional: clean the type
mutate(item_type = str_remove(item_type, "\\)$")) |>
# Restrict to items that have a type
filter(!is.na(item_type)) |>
# Restrict to day-item combinations with Emily Randolph and someone else
group_by(date, item_name) |>
filter(any(name == "Emily Randolph"), length(unique(name)) > 1 ) |>
# Display results
ungroup() |>
select(ordered, name, phone, item_name, item_type)
## # A tibble: 11 × 5
## ordered name phone item_name item_…¹
## <dttm> <chr> <chr> <chr> <chr>
## 1 2019-06-01 12:50:16 Emily Randolph 914-868-0316 Electric Machi… amber
## 2 2019-06-01 12:50:25 Jonathan Adams 315-618-5263 Electric Machi… purple
## 3 2019-08-18 12:47:14 Melvin Rodriguez III 914-698-1257 Electric Toast… mauve
## 4 2019-08-18 14:20:28 Emily Randolph 914-868-0316 Electric Toast… blue
## 5 2019-08-18 21:22:58 Dana Wright 838-284-6715 Electric Toast… blue
## 6 2021-04-17 11:14:30 Timothy Jones 716-691-6727 Noah's Jewelry green
## 7 2021-04-17 12:22:55 Frank Roach 516-520-9716 Noah's Jewelry yellow
## 8 2021-04-17 15:54:57 James Anderson 516-689-9580 Noah's Jewelry blue
## 9 2021-04-17 19:52:28 Emily Randolph 914-868-0316 Noah's Jewelry yellow
## 10 2022-07-26 08:18:05 Emily Randolph 914-868-0316 Noah's Jewelry orange
## 11 2022-07-26 12:05:26 Beth Mack 315-444-9913 Noah's Jewelry azure
## # … with abbreviated variable name ¹item_type
# Only purchase at the same time is Emily Randolph + Jonathan Adams
Day 8
# Has a full set of Noah's collectibles
dat |>
group_by(name, phone) |>
summarize(
n_collectible = sum(str_detect(desc, "^Noah's")),
.groups = "drop"
) |>
arrange(desc(n_collectible))
## # A tibble: 6,268 × 3
## name phone n_collectible
## <chr> <chr> <int>
## 1 Travis Bartlett 929-906-5980 230
## 2 Joshua Smith 914-243-3254 35
## 3 Christopher Bennett 516-686-9853 34
## 4 Megan Poole 585-479-7947 34
## 5 Justin Jimenez 516-722-4758 33
## 6 Chelsea Marsh 315-565-2714 30
## 7 Crystal Clark 315-296-8034 30
## 8 Joshua Parks 680-594-9693 30
## 9 Natasha Contreras 607-869-4191 30
## 10 Brendan Roberts 516-277-1533 29
## # … with 6,258 more rows