library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Dataset 2: Orders
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
1. Inner Join (3 points)
Question1 <- inner_join(customers, orders, by = "customer_id")
# a. 4 rows
nrow(Question1)
## [1] 4
# b. Inner join only returns when there is match in both tables. Both tables only match 3 customers and 1 customer has 2 orders
# c.
head(Question1)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
2. Left Join (3 points)
Question2 <- left_join(customers, orders, by = "customer_id")
# a. 6 rows
nrow(Question2)
## [1] 6
# b. This is different from inner join because all customers from the table are included,
# no matter the data they have
# c.
head(Question2)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
3. Right Join (3 points)
Question3 <- right_join(customers, orders, by = "customer_id")
# a. 6 rows
nrow(Question3)
## [1] 6
# b. customer_id 6 and 7 have a null because they dont have name or city data. This is because
# we take the data from the order table and match it to whatever we have from customers,
# even if there's no data
# c.
head(Question3)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
4. Full Join (3 points)
Question4 <- full_join(customers, orders, by = "customer_id")
# a. 8 rows
nrow(Question4)
## [1] 8
# b. Rows 5-8 have some data that is not in the table. Rows 5-6 have data from customers but not
# from orders. Rows 7-8 have data from the orders table but not from customers
# c.
head(Question4)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
5. Semi Join (3 points)
Question5 <- semi_join(customers, orders, by = "customer_id")
# a. 3 rows
nrow(Question5)
## [1] 3
# b. This differs from inner join because semi join only returns rows that are strictly in the
# customers table. Even though Bob has 2 customer_ids in the orders table, we are only
# retuning his row once, along with customer_ids 1 and 3
# c.
head(Question5)
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
6. Anti Join (3 points)
Question6 <- anti_join(customers, orders, by = "customer_id")
# a. David & Eve
Question6[2]
## # A tibble: 2 × 1
## name
## <chr>
## 1 David
## 2 Eve
# b. Anti join tells is about data that doesn't match up in both tables. Because the
# customer table comes first, it will display the cusotmers where the customer_id does not
# show up in the order table. In this case customer_id 4 and 5 are not in orders table, so
# that is why we see David and Eve displayed
#c.
head(Question6)
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
7. Practical Application (4 points)
# a. To find all customers, including those who haven't placed any orders, you
# would use a left_join. You would do this because it returns all rows
# from the customers table and the matching rows from the orders table.
# b. To find only the customers who have placed orders, you would use an
# inner_join. An inner_join would be perfect here because, an inner_join
# returns only rows which have a match in both the customers and orders
# data frames.
# c.
PracticalApplication1 <- result_left_join <- customers %>%
left_join(orders, by = "customer_id")
PracticalApplication2 <- result_inner_join <- customers %>%
inner_join(orders, by = "customer_id")
# d.
head(PracticalApplication1)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
head(PracticalApplication2)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
8. Challenge Question (3 points)
# Left join to ensure all customers are included
result <- customers %>%
left_join(orders, by = "customer_id") %>%
# Group by customer_id, customer_name, and city
group_by(customer_id, name, city) %>%
# Summarize: count number of orders and sum order amounts
summarize(total_orders = n_distinct(order_id, na.rm = TRUE),
total_amount_spent = sum(amount, na.rm = TRUE),
.groups = 'drop')