library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150))

1. Inner Join (3 points) Perform an inner join between the customers and orders datasets.

a. How many rows are in the result?

There are 4 Rows in the result

b. Why are some customers or orders not included in the result?

Interjoin only returns matched rows. The customers or orders that are not included in the result are not there because they do not have a match.

c.Display the result

 inner_join(customers, orders, by = "customer_id")
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

2. Left Join (3 points) Perform a left join with customers as the left table and orders as the right table.

a. How many rows are in the result?

There are 6 rows in this result

b.Explain why this number differs from the inner join result.

This number differs from the inter join result because left join also includes all left rows with matched right rows.

c.Display the result

left_join(customers, orders, by = "customer_id")
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

3.Right Join (3 points) Perform a right join with customers as the left table and orders as the right table.

a. How many rows are in the result?

There are 6 rows in the result

b.Which customer_ids in the result have NULL for customer name and city? Explain why.

David Houston and Eve Phoneix ids have null for customer name and city because they do not match, but all right rows are included in right join.

c.Display the result

right_join(customers, orders, by = "customer_id")
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

4. Full Join (3 points) Perform a full join between customers and orders.

a.How many rows are in the result?

There are 8 rows in this result

b.Identify any rows where there’s information from only one table. Explain these results.

Rows with information from only the orders table:Customer ID 6 and ID 7 are NULL for name and city. This means that customer with ID 6 and ID 7 have placed orders, but do not exist in the customers table.For rows with information from only the customers table, Customer ID 4 and ID 5 are null for order_id, product, and amount. This means that customers 4 and 5 exist in the customers table but have not placed any orders.

c.Display the result

full_join(customers, orders, by = "customer_id")
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150

5.Semi Join (3 points) Perform a semi join with customers as the left table and orders as the right table.

a. How many rows are in the result?

There are 3 rows in the result

b. How does this result differ from the inner join result?

The semi join returns only the distinct customer records from the customers table that have placed orders, without including any details from the orders table. Inner join returns detailed records that include both customer information and the corresponding order details. Semi join focuses solely on the presence of customers with orders, the inner join provides a fuller dataset, showing each customer alongside their associated orders.

c. Display the result

semi_join(customers, orders, by = "customer_id")
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

6.Anti Join (3 points) Perform an anti join with customers as the left table and orders as the right table.

a. Which customers are in the result?

David and Eve

b. Explain what this result tells you about these customers.

David and Eve are customers who have not placed any orders. They are present in the customers table but do not have any matches in the orders table. This means that they might be inactive customers or have not made any purchases.

c. Display the result

anti_join(customers, orders, by = "customer_id")
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

7. Practical Application (4 points) Imagine you’re analyzing customer behavior.

a. Which join would you use to find all customers, including those who haven’t placed any orders? Why?

To find all customers, including those who haven’t placed any orders, you would use a left join. This is because it will include all records from the customers table and add matching records from the orders tables. The result will still include customers who have not placed an order with a n/a for their order.

b. Which join would you use to find only the customers who have placed orders? Why?

To find only the customers who have placed orders, you would use an inner join. This is because it retrieves only the rows that have matching values in both tables.This means only those customers who have corresponding records in the orders table. Using inner join will show only customers who have made purchases.

c. Write the R code for both scenarios.

Left Join: all_customers <- customers %>%left_join(orders, by = “customer_id”)
Inner Join: active_customers_orders <- inner_join(customers, orders, by = “customer_id”)

d. Display the result

# Left Join: All customers including those without orders
all_customers <- customers %>%
  left_join(orders, by = "customer_id")

# Print the result
print(all_customers)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# Inner Join: Only customers who have placed orders
active_customers <- customers %>%
  inner_join(orders, by = "customer_id")

# Print the result
print(active_customers)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

8. Challenge Question (3 points) Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders. Hint: You’ll need to use a combination of joins and group_by/summarize operations.

# Create the summary with a full join
summary_data <- customers %>%
  full_join(orders, by = "customer_id") %>%
  group_by(customer_id, name, city) %>%
  summarise(
    total_orders = sum(!is.na(order_id)),      # Count of orders using !is.na
    total_amount = sum(amount, na.rm = TRUE)   # Sum of amounts, ignoring NAs
  ) %>%
  ungroup() %>%
  mutate(
    total_orders = ifelse(is.na(total_orders), 0, total_orders),  # Replace NA with 0
    total_amount = ifelse(is.na(total_amount), 0, total_amount)   # Replace NA with 0
  )
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
# Print the summary
print(summary_data)
## # A tibble: 7 × 5
##   customer_id name    city        total_orders total_amount
##         <dbl> <chr>   <chr>              <int>        <dbl>
## 1           1 Alice   New York               1         1200
## 2           2 Bob     Los Angeles            2         2300
## 3           3 Charlie Chicago                1          300
## 4           4 David   Houston                0            0
## 5           5 Eve     Phoenix                0            0
## 6           6 <NA>    <NA>                   1          600
## 7           7 <NA>    <NA>                   1          150