library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

Question 1

Perform the inner join

inner_join_result <- inner_join(customers, orders, by = "customer_id")
  1. Count the number of rows
nrow(inner_join_result)
## [1] 4
  1. Why are some customers or orders not included in the result? Inner joins only return rows with matching keys in both datasets.
  2. Display the inner join result
print(inner_join_result)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Question 2

Perform the left join

left_join_result <- left_join(customers, orders, by = "customer_id")
  1. Count the number of rows
nrow(left_join_result)
## [1] 6
  1. Why does this number differ from inner join result? The inner join only returns rows where there is a match in both customers and orders. As a result, customers without orders (e.g., customer_id = 4, 5) were excluded from the inner join.

  2. Display the result

print(left_join_result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

Question 3

Perform the Right Join

right_join_result <- right_join(customers, orders, by = "customer_id")
  1. Count the number of rows
nrow(right_join_result)
## [1] 6

b.Which customer_ids in the result have NULL for customer name and city? 6 and 7 have NA for name and city because in a right join, all records from the orders table are included, even if there is no matching customer_id in the customers table. When a match is not found, the values for columns from the customers table (like name and city) will be NA.

  1. Display the result
print(right_join_result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

Question 4

Preform a full join

full_join_result <- full_join(customers, orders, by = "customer_id")
  1. Count the number of rows
nrow(full_join_result)
## [1] 8
  1. Identify any rows where there’s information from only one table. Customer_id 4 and customer_id 5 have data in the customers table but NA for order_id, product, and amount because they haven’t placed any orders. Customer_id 6 and customer_id 7 have data in the orders table but NA for name and city because these customer IDs do not exist in the customers table.

  2. Display results

print(full_join_result)
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150

Question 5

Perform a semi join

semi_join_result <- semi_join(customers, orders, by = "customer_id")
  1. Count number of rows
row(semi_join_result)
##      [,1] [,2] [,3]
## [1,]    1    1    1
## [2,]    2    2    2
## [3,]    3    3    3
  1. How does this result differ from the inner join result? The inner join includes rows from both customers and orders tables, combining matching columns from both. The semi join only returns the rows from customers that have a match in orders, but it doesn’t include any columns from orders

  2. Display result

print(semi_join_result)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

Question 6

Perform the anti join

anti_join_result <- anti_join(customers, orders, by = "customer_id")
  1. Which Costumers are in the result? Customers 4 and 5 are in the result.

  2. Explain what this result tells you about these customer These customers are present in the customers dataset but haven’t placed any orders in the orders dataset. David and Eve are registered customers but have not made any purchases yet.

  3. Display result

print(anti_join_result)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

Question 7

  1. Which join would you use to find all customers, including those who haven’t placed any orders? Left join to find all customers including those without orders.It ensures that you get all customers, regardless of whether they have placed any orders or not.
all_customers_result <- left_join(customers, orders, by = "customer_id")

print(all_customers_result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
  1. Which join would you use to find only the customers who have placed orders? Inner join because it ensures that only customers who have placed orders are included in the result, since it requires a match in both tables.
customers_with_orders_result <- inner_join(customers, orders, by = "customer_id")

print(customers_with_orders_result)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300