Dataset 1: Customers

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

Dataset 2: Orders

orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

orders 
## # A tibble: 6 × 4
##   order_id customer_id product amount
##      <dbl>       <dbl> <chr>    <dbl>
## 1      101           1 Laptop    1200
## 2      102           2 Phone      800
## 3      103           3 Tablet     300
## 4      104           2 Desktop   1500
## 5      105           6 Camera     600
## 6      106           7 Printer    150

Question 1: Inner Join

inner_join_result <- inner_join(customers, orders, by = "customer_id")

 inner_join_result 
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

How Many rows are in the result? 4 customers

Why are some customers or orders not included in the result? Because if the customer_id do not match with both data sets it will not output in the results

Question 2: Left Join

left_join_result <- left_join(customers, orders, by = "customer_id")

left_join_result
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

How many rows are in result? 6 rows

Explain why this number differs from the inner join result? Because it outputs everything from the left of customer and inner join data set.

Question 3: Right Join

right_join_result <- right_join(customers, orders, by = "customer_id")
right_join_result
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
nrow(right_join_result)
## [1] 6

How many rows are in the result? 6 Rows

Which customer_ids in the result have NULL for customer name and city? Customer_ids 6 and 7 both have NULL output because their was never a customer name for Customer_ids so the data set outputted nothing (Null).

Question 4: Full join

full_join_result <- full_join(customers, orders, by = "customer_id")

full_join_result
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150
nrow(full_join_result)
## [1] 8

How many rows are in the result? 8 rows

Identify any rows where there’s information only one table? All the information from the order and customer data set will be in full join data set.

Question 5: Semi Join Perform

semi_join_result <- semi_join(customers, orders, by = "customer_id")

semi_join_result
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
nrow(semi_join_result)
## [1] 3

How many rows are in the result? 3 Rows

How does this result differ from the inner join result? Inner join has a another row for Bob which is not displayed in semi join. Because semi join returns only the rows from the first data frame (customers) that match the second data frame (orders).

Question 6: Anit Join

anti_join_result <- anti_join(customers, orders, by = "customer_id")

anti_join_result
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
anti_join_result$name 
## [1] "David" "Eve"

Which customers are in the result? David and Eve

Explain what the result tells you about the customers? David and Eva are both not connected to the order data set table.

Question 7: Partical Application

customers_left_join <- customers %>%
  left_join(orders, by = "customer_id")

customers_inner_join <- customers %>%
  inner_join(orders, by = "customer_id")

print("All customers (including those without orders):")
## [1] "All customers (including those without orders):"
print(customers_left_join)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
print("Only customers who have placed orders:")
## [1] "Only customers who have placed orders:"
print(customers_inner_join)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Which join would you use to find all customers, including those who haven’t placed any orders? Why? Use left_join to find all customers and customers who have not placed orders

which join would you use to find only the customers, including those who those who have placed orders? why? To find only the customers who have placed orders you would use inner join because inner join will filter out customers who have not placed any orders.

Question 8: Summary

library(tidyr)

customer_orders_summary <- customers %>%
  left_join(orders, by = "customer_id") %>%
  
  replace_na(list(amount = 0)) %>%
  
  group_by(customer_id,name, city) %>%
  
  summarize(
    total_orders = n(),  
    total_spent = sum(amount),
    .groups = "drop"
  )

customer_orders_summary
## # A tibble: 5 × 5
##   customer_id name    city        total_orders total_spent
##         <dbl> <chr>   <chr>              <int>       <dbl>
## 1           1 Alice   New York               1        1200
## 2           2 Bob     Los Angeles            2        2300
## 3           3 Charlie Chicago                1         300
## 4           4 David   Houston                1           0
## 5           5 Eve     Phoenix                1           0