library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

Question 1. Inner Join (3 points) Perform an inner join between the customers and orders datasets.

q1 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`

a.) How many rows are in the result?

There are 4 rows

b.) Why are some customers or orders not included in the result?

Inner join returns where there is a match on both tables. The customer data has only three customers in the order table, where one customer has two orders

c.) Display the result

head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Question 2. Left Join (3 points) Perform a left join with customers as the left table and orders as the right table.

q2 <- left_join(customers, orders, by = "customer_id")

a. How many rows are in the result?

There are 6 rows

b. Explain why this number differs from the inner join result.

This number differs because it wil return all rows not just rows with matching data

c. Display the result

head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

Question 3. Right Join (3 points) Perform a right join with customers as the left table and orders as the right table.

q3 <- right_join(customers, orders, by = "customer_id")

a. How many rows are in the result?

There are 6 rows

b. Which customer_ids in the result have NULL for customer name and city? Explain why.

There is a NULL in for rows 6 and 7, because there are only five customers, therefore there cannot be more customers than there are customer Ids. Two of the customers have ordered more than one item.

c. Display the result

head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

Question 4. Full Join (3 points) Perform a full join between customers and orders.

q4 <- full_join(customers, orders, by = "customer_id")

a. How many rows are in the result?

There are 8 rows

b. Identify any rows where there’s information from only one table. Explain these results.

In rows four and five there is only data from the customers table and in rows seven and eight there is only data from the orders table. This is because they do not have overlapping data.

c. Display the result

head(q4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

Question 5. Semi Join (3 points) Perform a semi join with customers as the left table and orders as the right table.

q5 <- semi_join(customers, orders, by = "customer_id")

a. How many rows are in the result?

There are 3 rows

b. How does this result differ from the inner join result?

In the semi join table there is only data from the customers data frame but instead of putting Bob twice he is only in there once. Bob has two orders, that’s why he was in there twice.

c. Display the result

head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

Questions 6. Anti Join (3 points) Perform an anti join with customers as the left table and orders as the right table.

q6 <- anti_join(customers, orders, by = "customer_id")

a. Which customers are in the result?

David and Eve are in the result

b. Explain what this result tells you about these customers.

Those customers are not in the orders table, only in the customers table.

c. Display the result

head(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

Question 7. Practical Application (4 points) Imagine you’re analyzing customer behavior.

a. Which join would you use to find all customers, including those who haven’t placed any orders? Why?

You would use left-join to find all customers even those who have not placed any orders. This is because it brings all elements of both tables together.

b. Which join would you use to find only the customers who have placed orders? Why?

You would use Semi-join to find only customers who placed an order. If you wanted to see the customers and the order you would use inner-join. For inner join, it returns only elements that match therefore you would get only customers that have placed an order. Semi-join returns all elements that match on both tables therefore you will get a return of people who have placed orders, it will only return names though and not the orders.

c. Write the R code for both scenarios.

all_customers <- merge(customers, orders, by = "customer_id", all.x = TRUE)
all_customers2 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`

d. Display the result

head(all_customers)
##   customer_id    name        city order_id product amount
## 1           1   Alice    New York      101  Laptop   1200
## 2           2     Bob Los Angeles      102   Phone    800
## 3           2     Bob Los Angeles      104 Desktop   1500
## 4           3 Charlie     Chicago      103  Tablet    300
## 5           4   David     Houston       NA    <NA>     NA
## 6           5     Eve     Phoenix       NA    <NA>     NA
head(all_customers2)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Question 8. Challenge Question (3 points) Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders. Hint: You’ll need to use a combination of joins and group_by/summarize operations.

summary_df <- full_join(customers, orders, by = "customer_id")%>%
  group_by(customer_id, city)%>%
  summarise(total_orders = n(), total_amount = sum(amount, na.rm = TRUE), .groups = "drop")

head(summary_df)
## # A tibble: 6 × 4
##   customer_id city        total_orders total_amount
##         <dbl> <chr>              <int>        <dbl>
## 1           1 New York               1         1200
## 2           2 Los Angeles            2         2300
## 3           3 Chicago                1          300
## 4           4 Houston                1            0
## 5           5 Phoenix                1            0
## 6           6 <NA>                   1          600