library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

# 1. Inner Join (3 points) Perform an inner join between the customers and orders datasets.
q1 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`
#  How many rows are in the result?
nrow(q1)
## [1] 4
# Why are some customers or orders not included in the result?
  # Because the customers and orders that are not included did not have a match in the other table

#  Display the result
head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
# 2. Left Join (3 points) Perform a left join with customers as the left table and orders as the right table.
q2 <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
# How many rows are in the result?
nrow(q2)
## [1] 6
# Explain why this number differs from the inner join result.
  # The left join keeps all customers, including those without orders. 
  # The inner join only keeps matching customer_id value from both tables.

# Display the result
head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# 3. Right Join (3 points) Perform a right join with customers as the left table and orders as the right table.
q3 <- right_join(customers, orders)
## Joining with `by = join_by(customer_id)`
# How many rows are in the result?
nrow(q3)
## [1] 6
# Which customer_ids in the result have NULL for customer name and city? Explain why.
q3 %>% 
  filter(is.na(name)) %>%
  select(customer_id)
## # A tibble: 2 × 1
##   customer_id
##         <dbl>
## 1           6
## 2           7
  # The right join keeps all rows from the orders table. 
  # customer_id 6 and 7 have NA for name and city because they exist in the orders table but not in customers table.

# Display the result
head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
# 4. Full Join (3 points) Perform a full join between customers and orders.
q4 <- full_join(customers, orders)
## Joining with `by = join_by(customer_id)`
# How many rows are in the result?
nrow(q4)
## [1] 8
# Identify any rows where there’s information from only one table. Explain these results.
q4 %>%
  filter(is.na(order_id) | is.na(name))
## # A tibble: 4 × 6
##   customer_id name  city    order_id product amount
##         <dbl> <chr> <chr>      <dbl> <chr>    <dbl>
## 1           4 David Houston       NA <NA>        NA
## 2           5 Eve   Phoenix       NA <NA>        NA
## 3           6 <NA>  <NA>         105 Camera     600
## 4           7 <NA>  <NA>         106 Printer    150
  # The full join includes every customer and every order.
  # NA value appear where there is no matching customer_id in one of the tables.

# Display the result
head(q4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# 5 Semi Join (3 points) Perform a semi join with customers as the left table and orders as the right table.
q5 <- semi_join(customers, orders)
## Joining with `by = join_by(customer_id)`
# How many rows are in the result?
nrow(q5)
## [1] 3
# How does this result differ from the inner join result?
  # The semi join returns only customers who have placed orders. 
  # It differs from the inner join because it only shows customer information 
  # and does not repeat rows for mulitple orders.

# Display the result
head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
# 6 Anti Join (3 points) Perform an anti join with customers as the left table and orders as the right table.
q6 <- anti_join(customers, orders)
## Joining with `by = join_by(customer_id)`
# Which customers are in the result?
q6
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
# Explain what this result tells you about these customers.
  # The anti join returns customers who do not have any matching orders.
  # These customers have not placed any orders.

# Display the result
head(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
# 7 Practical Application (4 points) Imagine you’re analyzing customer behavior.

# Which join would you use to find all customers, including those who haven’t placed any orders? Why?
  # I would use a left join because it keeps all customers from the customers table, even if they do not have matching orders.

all_customers <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
head(all_customers)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# Which join would you use to find only the customers who have placed orders? Why?
  # I would use a semi join because it returns only customers who have matching orders, wihtout repeating rows or including order details.

customers_with_orders <- semi_join(customers, orders)
## Joining with `by = join_by(customer_id)`
head(customers_with_orders)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
# 8 Challenge Question

summary_table <- customers %>%
  left_join(orders) %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = sum(!is.na(order_id)),
    total_amount_spent = sum(amount, na.rm = TRUE), 
    .groups = "drop"
  )
## Joining with `by = join_by(customer_id)`
summary_table
## # A tibble: 5 × 5
##   customer_id name    city        total_orders total_amount_spent
##         <dbl> <chr>   <chr>              <int>              <dbl>
## 1           1 Alice   New York               1               1200
## 2           2 Bob     Los Angeles            2               2300
## 3           3 Charlie Chicago                1                300
## 4           4 David   Houston                0                  0
## 5           5 Eve     Phoenix                0                  0