library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

1. Inner Join (3 points) Perform an inner join between the customers and orders datasets

q1 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`

a. How many rows are in the result?

The data has 4 rows

b. Why are some customers or orders not included in the result?

Inner joins return where there is a match on both tables. Both tables only have in common customers 1, 2, and 3, where one customer has 2 orders, which is why we have 4.

c. Display the result

head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

2. Left Join (3 points) Perform a left join with customers as the left table and orders as the right table.

q2 <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`

a. How many rows are in the result?

The data has 6 rows.

b. Explain why this number differs from the inner join result.

Left join includes all rows from the left table and matching rows from the right table. Since there was a matching row of data for the customer_IDs for every customer_ID that matched from the orders table to the customers table, it would also show that customers product, amount, and order_id with all the information from the customers table as well. For the two people without an order, it would show N/A in those columns, as they didn’t have a match. It also shows two twice, signifying that customer two ordered two things.

c. Display the result

head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

3. Right Join (3 points) Perform a right join with customers as the left table and orders as the right table.

q3 <- right_join(customers, orders)
## Joining with `by = join_by(customer_id)`

a. How many rows are in the result?

This data has 6 rows.

b. Which customer_ids in the result have NULL for customer name and city? Explain why.

Customer_ids 6 and 7 have null results for customer name and city because right joins return all rows from the right table and matching rows from the left. Since they are joined by customer_id, customer 1,2,3 from the customer table have orders placed, while 6 and 7 from the orders table do not have a row in the customers table.

c. Display the result

head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

4. Full Join (3 points) Perform a full join between customers and orders.

q4 <- full_join(customers, orders)
## Joining with `by = join_by(customer_id)`

a. How many rows are in the result?

This data has 8 rows.

b. Identify any rows where there’s information from only one table. Explain these results.

For customer_id 4 and 5 there is on information from the customers table and for customer_id 6 and 7 it prints from the orders table. This is because with a full join it prints the two tables together. Since it is joined by customer_id it is going to print all of them 1 through 7, and null (N/A) any data that is missing from one of the two tables.

c. Display the result

head(q4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

5. Semi Join (3 points) Perform a semi join with customers as the left table and orders as the right table.

q5 <- semi_join(customers, orders)
## Joining with `by = join_by(customer_id)`

a. How many rows are in the result?

This data has 3 rows.

b. How does this result differ from the inner join result?

Semi joins purpose is to return all rows from the left (customers) table where there is a match from the right (orders) table. This result is different from the inner join because the inner join returns where there is a match based on a common variable for both tables where the semi join only returns the table to the left, which in this case is customers. Therefore customer I_d 1, 2, and 3 are shown with the name and city because both tables have customer_Id 1, 2, and 3.

c. Display the result

head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

6. Anti Join (3 points) Perform an anti join with customers as the left table and orders as the right table.

q6 <- anti_join(customers, orders)
## Joining with `by = join_by(customer_id)`

a. Which customers are in the result?

Customer_Id 4 and 5, or David and Eve.

b. Explain what this result tells you about these customers.

David and Eve both are in the left table, but not the right table. The anti-joins return all rows from the left that are not in the right. They don’t have orders in the order table.

c. Display the result

head(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

7. Practical Application (4 points) Imagine you’re analyzing customer behavior.

a. Which join would you use to find all customers, including those who haven’t placed any orders? Why?

Full join because that would give me all the customer_ids from each table, all the customers, including those who haven’t placed any orders yet.

b. Which join would you use to find only the customers who have placed orders? Why?

Right join because that shows all the orders based on customer_id and the corresponding customers that ordered it.

c. Write the R code for both scenarios.

q7a <- full_join(customers, orders) 
## Joining with `by = join_by(customer_id)`
q7b <- right_join(customers, orders)
## Joining with `by = join_by(customer_id)`

d. Display the result

head(q7a)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
head(q7b)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

8. Challenge Question (3 points) Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders. Hint: You’ll need to use a combination of joins and group_by/summarize operations.

q8 <- customers %>%
  left_join(orders, by = "customer_id") %>%
  group_by(name, city) %>%
  summarise(total_orders = n(),
            total_amount = sum(amount, na.rm = TRUE),
            .groups = 'drop')
head(q8)
## # A tibble: 5 × 4
##   name    city        total_orders total_amount
##   <chr>   <chr>              <int>        <dbl>
## 1 Alice   New York               1         1200
## 2 Bob     Los Angeles            2         2300
## 3 Charlie Chicago                1          300
## 4 David   Houston                1            0
## 5 Eve     Phoenix                1            0