library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Dataset 2: Orders
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
Q1 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`
How many rows are in the result?
The data has 4 rows
Why are some customers or orders not included in the
result?
Inner join returns where there is a match on both tables. The customer
data has only 3 customers in the order table where one customer has 2
orders.
Display the result
head(Q1)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
Q2 <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
How many rows are in the result? The data has 6 rows
Explain why this number differs from the inner join result. The left join includes all customers even those without order information.
Display the result
head(Q2)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
Q3 <- right_join(customers, orders)
## Joining with `by = join_by(customer_id)`
How many rows are in the result? The data has 6 rows
Which customer_ids in the result have NULL for customer name and city? Explain why. The customer ids of 6 and 7 have NULL for customer name and city because those customer IDs are in the orders dataset but they are not included in the customers dataset.
Display the result
head(Q3)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
Q4 <- full_join(customers, orders)
## Joining with `by = join_by(customer_id)`
How many rows are in the result? The data has 8 rows
Identify any rows where there’s information from only one table. Explain these results. In the customers table, customer IDs of 4 and 5 have information from only one table (customers) because they have not placed any orders. In the orders table, customer IDs 6 and 7 have information from only one table (orders) because they are not in the customers table.
Display the result
head(Q4)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
Q5 <- semi_join(customers, orders)
## Joining with `by = join_by(customer_id)`
How many rows are in the result? The data has 3 rows
How does this result differ from the inner join result? The results from the semi join returns rows from customers with matches in orders, showing only customer data. While the inner join returns rows with matches in both datasets, including all relevant data from both customers and orders.
Display the result
head(Q5)
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
Q6 <- anti_join(customers, orders)
## Joining with `by = join_by(customer_id)`
Q6 %>%
select(name)
## # A tibble: 2 × 1
## name
## <chr>
## 1 David
## 2 Eve
Explain what this result tells you about these customers. These customers are part of the customers dataset but have not made any purchases reflected in the orders dataset.
Display the result
head(Q6)
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
Which join would you use to find all customers, including those who haven’t placed any orders? Why? I would use the full join because it would combine the customer dataset and the orders dataset and return all rows from both datasets, so customers who have placed orders with their order detials and customers who haven’t placed orders with NA displayed in the order columns.
Which join would you use to find only the customers who have placed orders? Why? I would use the right join because then I will get a list of customers along with their order details for customers who actually placed orders.
Write the R code for both scenarios.
Q7A <- full_join(customers, orders, by = "customer_id")
Q7B <- right_join(customers, orders, by = "customer_id")
head(Q7A)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
head(Q7B)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
EC <- full_join(customers, orders, by = "customer_id") %>%
group_by(customer_id, name, city) %>%
summarize(
total_orders = sum(!is.na(order_id)),
total_amount_spent = sum(amount, na.rm = TRUE)
) %>%
arrange(name)
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.