library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)
  1. Perform an inner join between the customers and orders datasets.
q1 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`
  1. How many rows are in the result?

There are 4 rows

  1. Why are some customers or orders not included in the result?

Inner join returns where there is a match on both tables.
The customer data has only 3 customers in the order table where one customer has 2 orders

  1. Display the result
head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
  1. Perform a left join with customers as the left table and orders as the right table.
q2 <- left_join(customers , orders)
## Joining with `by = join_by(customer_id)`
  1. How many rows are in the result?

There are 6 rows

  1. Explain why this number differs from the inner join result.

Left join returns all rows from the left table (customers), and the matching rows from the right table (orders), rather than returning where there is a match on both tables.
If there is no match, it fills in NA for the right table’s columns.

  1. Display the result
head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
  1. Perform a right join with customers as the left table and orders as the right table.
q3 <- right_join(customers , orders)
## Joining with `by = join_by(customer_id)`
  1. How many rows are in the result?

There are 6 rows

  1. Which customer_ids in the result have NULL for customer name and city? Explain why.

customer_id: 6 and customer_id: 7.
These customer_id values exist in the orders dataset but do not have corresponding entries in the customers dataset.

  1. Display the result
head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
  1. Perform a full join between customers and orders.
q4 <- full_join(customers , orders)
## Joining with `by = join_by(customer_id)`
  1. How many rows are in the result?

There are 8 rows

  1. Identify any rows where there’s information from only one table. Explain these results.

The rows with customer_id 4 and 5 show customers who did not place any orders, therefore providing no order information. The rows with customer_id 6 and 7 represent orders made by customers not listed in the customers dataset, causing the lack of customer information.

  1. Display the result
print(q4)
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150
  1. Perform a semi join with customers as the left table and orders as the right table.
q5 <- semi_join(customers , orders)
## Joining with `by = join_by(customer_id)`
  1. How many rows are in the result?

There are 3 rows

  1. How does this result differ from the inner join result?

Semi join returns rows from the left table (customers) that have matching values in the right table (orders). Only includes columns from the left table and does not duplicate rows for multiple matches.
Where inner join Combines rows from both tables where there are matching values in the specified columns. Returns all columns from both tables for those matched rows.
Does not show duplicates.

  1. Display the result
print(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
  1. Perform an anti join with customers as the left table and orders as the right table.
q6 <- anti_join(customers , orders)
## Joining with `by = join_by(customer_id)`
  1. Which customers are in the result?

David (customer_id 4)
Eve (customer_id 5)

  1. Explain what this result tells you about these customers.

The main purpose of an anti join is to identify records in one dataset that do not have corresponding records in another dataset.
David and Eve do not have any entries in the orders dataset, meaning they have not placed any orders.

  1. Display the result
print(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
  1. Practical Application (4 points) Imagine you’re analyzing customer behavior.
  1. Which join would you use to find all customers, including those who haven’t placed any orders? Why?

Left Join.
A left join returns all rows from the left table (customers), along with matched rows from the right table (orders).
If there are no matches, it still returns the rows from the left table with NA values for columns from the right table.
This way, you can see all customers, including those who haven’t placed any orders.

  1. Which join would you use to find only the customers who have placed orders? Why?

Inner Join.
An inner join returns only the rows where there are matches between the left table (customers) and the right table (orders).
This means it will only return customers who have placed at least one order, excluding those without any orders.

  1. Write the R code for both scenarios.
q7a <- left_join(customers , orders)
## Joining with `by = join_by(customer_id)`
q7b <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`
  1. Display the result
print(q7a)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
print(q7b)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Challenge Question. Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders.

customer_summary <- customers %>%
  left_join(orders, by = "customer_id") %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = sum(!is.na(order_id)),       
    total_amount_spent = sum(amount, na.rm = TRUE)  
  ) %>%
  ungroup()  
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
print(customer_summary)
## # A tibble: 5 × 5
##   customer_id name    city        total_orders total_amount_spent
##         <dbl> <chr>   <chr>              <int>              <dbl>
## 1           1 Alice   New York               1               1200
## 2           2 Bob     Los Angeles            2               2300
## 3           3 Charlie Chicago                1                300
## 4           4 David   Houston                0                  0
## 5           5 Eve     Phoenix                0                  0