library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Dataset 1: Customers
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
Dataset 2: Orders
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
Practical Application (4 points) Imagine you’re analyzing customer
behavior.
Which join would you use to find all customers, including those who
haven’t placed any orders? Why?
- I would use the left join because then all customers are included,
regardless of whether they’ve placed any orders.
Which join would you use to find only the customers who have placed
orders? Why?
- I would use the inner join function because excludes customers who
don’t have matching records in the orders table.
Write the R code for both scenarios.
practical1 <- left_join(orders, customers, by = "customer_id")
practical2 <-inner_join(orders, customers, by = "customer_id")
Display the result
head(practical1)
## # A tibble: 6 × 6
## order_id customer_id product amount name city
## <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 101 1 Laptop 1200 Alice New York
## 2 102 2 Phone 800 Bob Los Angeles
## 3 103 3 Tablet 300 Charlie Chicago
## 4 104 2 Desktop 1500 Bob Los Angeles
## 5 105 6 Camera 600 <NA> <NA>
## 6 106 7 Printer 150 <NA> <NA>
head(practical2)
## # A tibble: 4 × 6
## order_id customer_id product amount name city
## <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 101 1 Laptop 1200 Alice New York
## 2 102 2 Phone 800 Bob Los Angeles
## 3 103 3 Tablet 300 Charlie Chicago
## 4 104 2 Desktop 1500 Bob Los Angeles
Challenge Question (3 points) Create a summary that shows each
customer’s name, city, total number of orders, and total amount spent.
Include all customers, even those without orders. Hint: You’ll need to
use a combination of joins and group_by/summarize operations.
challenge <- summary <- customers %>%
left_join(orders, by = "customer_id") %>%
group_by(name, city) %>%
summarize(
total_orders = n(),
total_amount_spent = sum(amount, na.rm = TRUE)
)
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.