library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

Task 1 - Perform inner join

inner_join_result <- customers %>%
  inner_join(orders, by = "customer_id")
inner_join_result
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
nrow(inner_join_result)
## [1] 4

How many rows are in the result?
The result will have 4 rows.

#Why are some customers or orders not included in the result?
#Customers 4 and 5 are not included because they don't have any orders in the orders table. Orders with customer_id 6 and 7 are not included because these customers do not exist in the customers table.
# Task 2 - Perform left join
left_join_result <- customers %>%
  left_join(orders, by = "customer_id")
left_join_result
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
nrow(left_join_result)
## [1] 6
#How many rows are in the result?
#The result will have 5 rows (all customers are included).

#Why does this number differ from the inner join result?
#Left join includes all rows from the customers table, while the inner join only includes rows with matching customer_id values in both tables.
# Task 3 - Perform right join
right_join_result <- customers %>%
  right_join(orders, by = "customer_id")
right_join_result
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
nrow(right_join_result)
## [1] 6
#How many rows are in the result?
#The result will have 6 rows (all orders are included).

#Which customer_ids in the result have NULL for customer name and city?
#customer_ids 6 and 7 have NULL values because these customers do not exist in the customers table.
# Task 4 - Perform full join
full_join_result <- customers %>%
  full_join(orders, by = "customer_id")
full_join_result
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150
nrow(full_join_result)
## [1] 8
#How many rows are in the result?
#The result will have 7 rows (all customers and all orders are included).

#Identify any rows where there’s information from only one table.
#Rows with customer_id 4 and 5 contain information only from the customers table (no matching orders).
#Rows with customer_id 6 and 7 contain information only from the orders table (no matching customers).
# Task 5 - Perform semi join
semi_join_result <- customers %>%
  semi_join(orders, by = "customer_id")
semi_join_result
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
nrow(semi_join_result)
## [1] 3
#How many rows are in the result?
#The result will have 3 rows.

#How does this result differ from the inner join result?
#The semi join only returns the matching rows from the customers table, while the inner join returns columns from both tables.
# Task 6 -Perform anti join
anti_join_result <- customers %>%
  anti_join(orders, by = "customer_id")
anti_join_result
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
#Which customers are in the result?
#The customers in the result are David (customer 4) and Eve (customer 5).

#What does this result tell you about these customers?
#These customers have not placed any orders.
# Task 7 - 
#Havent placed orders
#Use left join for this scenario
all_customers <- customers %>%
  left_join(orders, by = "customer_id")
all_customers
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
#Have placed any orders
# Use inner join for this scenario
customers_with_orders <- customers %>%
  inner_join(orders, by = "customer_id")
customers_with_orders
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
# Task 8 - Perform left join and then summarize
customer_summary <- customers %>%
  left_join(orders, by = "customer_id") %>%
  group_by(name, city) %>%
  summarize(
    total_orders = n(),
    total_amount_spent = sum(amount, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
customer_summary
## # A tibble: 5 × 4
## # Groups:   name [5]
##   name    city        total_orders total_amount_spent
##   <chr>   <chr>              <int>              <dbl>
## 1 Alice   New York               1               1200
## 2 Bob     Los Angeles            2               2300
## 3 Charlie Chicago                1                300
## 4 David   Houston                1                  0
## 5 Eve     Phoenix                1                  0