library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

1. Inner Join (3 points)

Question1 <- inner_join(customers, orders, by = "customer_id")
# a. 4 rows 
nrow(Question1)
## [1] 4
# b. Inner join only returns when there is match in both tables. Both tables only match 3 customers and 1 customer has 2 orders
# c.
head(Question1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

2. Left Join (3 points)

Question2 <- left_join(customers, orders, by = "customer_id")
# a. 6 rows
nrow(Question2)
## [1] 6
# b. This is different from inner join because all customers from the table are included, 
#   no matter the data they have
# c.
head(Question2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

3. Right Join (3 points)

Question3 <- right_join(customers, orders, by = "customer_id")
# a. 6 rows
nrow(Question3)
## [1] 6
# b. customer_id 6 and 7 have a null because they dont have name or city data. This is because 
#   we take the data from the order table and match it to whatever we have from customers, 
#   even if there's no data
# c. 
head(Question3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

4. Full Join (3 points)

Question4 <- full_join(customers, orders, by = "customer_id")
# a. 8 rows
nrow(Question4)
## [1] 8
# b. Rows 5-8 have some data that is not in the table. Rows 5-6 have data from customers but not
# from orders. Rows 7-8 have data from the orders table but not from customers
# c. 
head(Question4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

5. Semi Join (3 points)

Question5 <- semi_join(customers, orders, by = "customer_id")
# a. 3 rows
nrow(Question5)
## [1] 3
# b. This differs from inner join because semi join only returns rows that are strictly in the 
# customers table. Even though Bob has 2 customer_ids in the orders table, we are only
# retuning his row once, along with customer_ids 1 and 3
# c. 
head(Question5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

6. Anti Join (3 points)

Question6 <- anti_join(customers, orders, by = "customer_id")
# a. David & Eve
Question6[2]
## # A tibble: 2 × 1
##   name 
##   <chr>
## 1 David
## 2 Eve
# b. Anti join tells is about data that doesn't match up in both tables. Because the 
# customer table comes first, it will display the cusotmers where the customer_id does not
# show up in the order table. In this case customer_id 4 and 5 are not in orders table, so
# that is why we see David and Eve displayed
#c. 
head(Question6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

7. Practical Application (4 points)

# a. To find all customers, including those who haven't placed any orders, you
#    would use a left_join.  You would do this because it returns all rows
#    from the customers table and the matching rows from the orders table.
# b. To find only the customers who have placed orders, you would use an 
#    inner_join. An inner_join would be perfect here because, an inner_join
#    returns only rows which have a match in both the customers and orders 
#    data frames.
# c. 
PracticalApplication1 <- result_left_join <- customers %>%
  left_join(orders, by = "customer_id") 
PracticalApplication2 <- result_inner_join <- customers %>%
  inner_join(orders, by = "customer_id")
# d.
head(PracticalApplication1)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
head(PracticalApplication2)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

8. Challenge Question (3 points)

# Left join to ensure all customers are included
result <- customers %>%
  left_join(orders, by = "customer_id") %>%
# Group by customer_id, customer_name, and city
  group_by(customer_id, name, city) %>%
# Summarize: count number of orders and sum order amounts
  summarize(total_orders = n_distinct(order_id, na.rm = TRUE), 
            total_amount_spent = sum(amount, na.rm = TRUE),
            .groups = 'drop')