library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

Question 1

# Perform Inner Join
Joined_Table <- customers %>%
  inner_join(orders, by = "customer_id")

# Display the result
print(Joined_Table)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
# Count the number of rows in the result
num_rows <- nrow(Joined_Table)
cat("Number of rows in the inner join result:", num_rows, "\n")
## Number of rows in the inner join result: 4
# Explanation
#Some customers are not included because they do not have matching entries in the orders dataset.
#Similarly, some orders are not included because their customer_id does not exist in the customers dataset.
#An inner join only keeps rows where there is a match in both datasets.

Question 2

#Question 2

# Perform Left Join
left_join_result <- customers %>%
  left_join(orders, by = "customer_id")



# Count the number of rows in the result
num_rows_left_join <- nrow(left_join_result)
cat("Number of rows in the left join result:", num_rows_left_join, "\n")
## Number of rows in the left join result: 6
# Display the result
print(left_join_result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# Explanation
#A left join keeps all rows from the customers table (left table), even if there is no matching entry in the orders table (right table).
#For customers who do not have orders, the order-related columns (order_id, product, amount) will have NA values.

Question 3

# Perform Right Join
right_join_result <- customers %>%
  right_join(orders, by = "customer_id")



# Count the number of rows in the result
num_rows_right_join <- nrow(right_join_result)
cat("Number of rows in the right join result:", num_rows_right_join, "\n")
## Number of rows in the right join result: 6
# Identify customer_ids with NULL (NA) for name and city
null_customers <- right_join_result %>%
  filter(is.na(name)) %>%
  select(customer_id)


# Count the number of rows in the result
num_rows_right_join <- nrow(right_join_result)
cat("Number of rows in the right join result:", num_rows_right_join, "\n")
## Number of rows in the right join result: 6
# Display the result
print(right_join_result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
# Explanation
#Number of rows is 6 
#A right join keeps all rows from the orders table (right table), even if there is no matching entry in the customers table (left table).
#Customer IDs 6 and 7 have NULL values for name and city because they exist in the orders dataset but do not have a corresponding entry in the customers dataset.

Question 4

# Perform Full Join
full_join_result <- customers %>%
  full_join(orders, by = "customer_id")



# Count the number of rows in the result
num_rows_full_join <- nrow(full_join_result)
cat("Number of rows in the full join result:", num_rows_full_join, "\n")
## Number of rows in the full join result: 8
#Number of rows is 8 


# Identify rows where information is from only one table
only_customers <- full_join_result %>% filter(is.na(order_id))  # Customers without orders
only_orders <- full_join_result %>% filter(is.na(name))  # Orders without customers


# Display the result
print(full_join_result)
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150
# Explanation
#A full join keeps all rows from both datasets, combining data where there is a match.
#Customers 4 and 5 appear in the result without order-related information (NA values) because they do not have orders.
#Orders with customer IDs 6 and 7 appear in the result with NA values for customer name and city because these customer IDs do not exist in the customers dataset.

Question 5

# Perform Semi Join
semi_join_result <- customers %>%
  semi_join(orders, by = "customer_id")



# Count the number of rows in the result
num_rows_semi_join <- nrow(semi_join_result)
cat("Number of rows in the semi join result:", num_rows_semi_join, "\n")
## Number of rows in the semi join result: 3
# Display the result
print(semi_join_result)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
# Explanation
#Number of rows in the semi join result: 3
#A semi join returns only rows from the customers table (left table) that have a match in the orders table (right table).
#Unlike an inner join, it does not include any columns from the orders table, only filtering the customers that have at least one matching order.
#Customers 4 and 5 are excluded because they do not have a corresponding order in the orders table.

Question 6

# Perform Anti Join
anti_join_result <- customers %>%
  anti_join(orders, by = "customer_id")


# Count the number of rows in the result
num_rows_anti_join <- nrow(anti_join_result)
cat("Number of rows in the anti join result:", num_rows_anti_join, "\n")
## Number of rows in the anti join result: 2
# Display the result
print(anti_join_result)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
# Explanation
#An anti join returns only rows from the customers table (left table) that have NO match in the orders table (right table).
#This means these customers do not have any associated orders in the orders dataset.
#Customers in the result are those who exist in the customers table but are missing from the orders table.

Question 7

#Question 7

#Scenario 1: Finding all customers, including those who haven’t placed any orders

all_customers <- customers %>%
  left_join(orders, by = "customer_id")


# Count rows
num_rows_all_customers <- nrow(all_customers)
cat("Number of rows in the result for all customers:", num_rows_all_customers, "\n")
## Number of rows in the result for all customers: 6
# Display the result
print(all_customers)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# Explanation
#Number of rows in the result for all customers: 6
#A LEFT JOIN is used because it keeps all customers, including those who have NOT placed any orders.
#Customers 4 (David) and 5 (Eve) appear with NA values in the order columns since they have no orders.


#Scenario 2: Finding only customers who have placed orders


customers_with_orders <- customers %>%
  inner_join(orders, by = "customer_id")


# Count rows
num_rows_customers_with_orders <- nrow(customers_with_orders)
cat("Number of rows in the result for customers who placed orders:", num_rows_customers_with_orders, "\n")
## Number of rows in the result for customers who placed orders: 4
# Display the result
print(customers_with_orders)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
# Explanation
#Number of rows in the result for customers who placed orders: 4
#An INNER JOIN is used because it keeps only customers who have placed at least one order.
#Customers 4 (David) and 5 (Eve) are excluded since they have no matching orders in the orders dataset.

Question 8

# Perform Left Join to include all customers
customer_summary <- customers %>%
  left_join(orders, by = "customer_id") %>%  # Keep all customers, even those without orders
  group_by(customer_id, name, city) %>%  # Group by customer details
  summarize(
    total_orders = n(),  # Count total orders per customer
    total_amount_spent = sum(amount, na.rm = TRUE)  # Sum total spending per customer
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
# Display the result
print(customer_summary)
## # A tibble: 5 × 5
##   customer_id name    city        total_orders total_amount_spent
##         <dbl> <chr>   <chr>              <int>              <dbl>
## 1           1 Alice   New York               1               1200
## 2           2 Bob     Los Angeles            2               2300
## 3           3 Charlie Chicago                1                300
## 4           4 David   Houston                1                  0
## 5           5 Eve     Phoenix                1                  0
# Explanation
#This summary includes all customers, even those who have not placed orders.
#Customers without orders will have 'total_orders' as 1 (due to left join keeping them) and 'total_amount_spent' as 0.