library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)
########## Question 1

inner_join_result <- inner_join(customers, orders, by = "customer_id")
print(inner_join_result)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
nrow(inner_join_result)
## [1] 4
#a.) The result contains 4 rows.
#b.) Some of the customers or orders arent included in the result because an
#inner join only returns rows with matching values in both data sets. 
########### Question 2

left_join_result <- left_join(customers, orders, by = "customer_id")
print(left_join_result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
nrow(left_join_result)
## [1] 6
#a.) The result contains 6 rows.
#b.) The left join includes all customers, even those without the matching orders,
#whereas in the inner join only includes the customers with matching orders. 
#Ex. David and Eve, who didn't place any orders, are included in the left join
#but where excluded from the inner join. 
################# Question 3

right_join_result <- right_join(customers, orders, by = "customer_id")
print(right_join_result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
nrow(right_join_result)
## [1] 6
#a.) The result contains 6 rows.
#b.) Customer Ids 6 and 7 have NULL values because there are no matching record
#in the customers table for these IDS. 
############# Question 4

full_join_result <- full_join(customers, orders, by = "customer_id")
print(full_join_result)
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150
nrow(full_join_result)
## [1] 8
#a.) The result contains 8 rows.
#b.) Rows 5 and 6 have NA for order details because David and Eve didn't place
#any orders. Rows 7 and 8 have NA for customer details because customer IDs 6
#7 dont exist in the customers table
############### Question 5

semi_join_result <- semi_join(customers, orders, by = "customer_id")
print(semi_join_result)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
nrow(semi_join_result)
## [1] 3
#a.) The result contains 3 rows.
#b.) The semi join returns only customers rows with matching customer IDs, while
#the inner join includes columns from both customers and orders. 
############# Question 6

anti_join_result <- anti_join(customers, orders, by = "customer_id")
print(anti_join_result)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
nrow(anti_join_result)
## [1] 2
#a.) The customers in the result are David and Eve.
#b.) The result shows that David and Eve have not placed any orders, as the anti
#join returns only the rows from the customers table where there is no matching
#customer ID in the orders table.
############### Question 7

#a.) you would use a left join because it returns all rows from the customers
#table, even if there is no matching entry in the orders table. 
#b.) you would use a semi join because it returns only the rows from the custom-
#ers table where there is a matching entry in the orders table, without includin
#g the order details.

# Left join to find all customers, including those who haven’t placed any orders
all_customers_result <- left_join(customers, orders, by = "customer_id")
print(all_customers_result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# Semi join to find only the customers who have placed orders
customers_with_orders_result <- semi_join(customers, orders, by = "customer_id")
print(customers_with_orders_result)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
########## Question 8

customer_orders <- left_join(customers, orders, by = "customer_id")

customer_summary <- customer_orders %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = n(),
    total_amount = sum(amount, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
print(customer_summary)
## # A tibble: 5 × 5
## # Groups:   customer_id, name [5]
##   customer_id name    city        total_orders total_amount
##         <dbl> <chr>   <chr>              <int>        <dbl>
## 1           1 Alice   New York               1         1200
## 2           2 Bob     Los Angeles            2         2300
## 3           3 Charlie Chicago                1          300
## 4           4 David   Houston                1            0
## 5           5 Eve     Phoenix                1            0