library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Dataset 1: Customers

customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

Dataset 2: Orders

orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

Inner Join (3 points) Perform an inner join between the customers and orders datasets.

q1 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

nrow(q1)
## [1] 4

Why are some customers or orders not included in the result?

Customers 4 and 5 are excluded because they dont have any matching orders. Orders for customer IDs 6 and 7 are excluded because these customers are not found in the customers dataset

#Display the result

print(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Left Join (3 points) Perform a left join with customers as the left table and orders as the right table.

q2 <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

nrow(q2)
## [1] 6

Explain why this number differs from the inner join result.

the left join includes all customers (even those without orders), while the inner join only includes customers who have placed orders. This is why the number of rows in the left join result is typically greater.

Display the result

print(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

Right Join (3 points) Perform a right join with customers as the left table and orders as the right table.

q3 <- right_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

nrow(q3)
## [1] 6

Which customer_ids in the result have NULL for customer name and city? Explain why.

The customer IDs that have NULL (or NA in R) for both name and city are: Customer ID 6 and Customer ID 7. These customer IDs are present in the orders dataset but do not exist in the customers dataset.

Display the result

print (q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

Full Join (3 points) Perform a full join between customers and orders.

q4 <- full_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

nrow(q4)
## [1] 8

Identify any rows where there’s information from only one table. Explain these results.

Rows with information only from the customers table Customer IDs David and Eve have no corresponding order, so the order_id, product, and amount columns are NA. Rows with information only from the orders table: - Customer IDs 6 and 7 have no matching customer information, so the name and city columns are `NA

Display the result

print(q4)
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150

Semi Join (3 points) Perform a semi join with customers as the left table and orders as the right table.

q5 <- semi_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

nrow(q5)
## [1] 3

How does this result differ from the inner join result?

Semi Join only includes rows from the customers table where there is a matching customer_id in the orders table, but does not include any data from the orders table. Only customer information is returned. Inner Join Includes rows from both customers and orders where there is a matching customer_id, and it includes columns from both tables.

#Display the result

print (q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

Anti Join (3 points) Perform an anti join with customers as the left table and orders as the right table.

q6 <- anti_join(customers, orders)
## Joining with `by = join_by(customer_id)`

Which customers are in the result?

Customer IDs in `results 1, 2, 3, 4, 5. Customer IDs in orders 1, 2, 3, 2, 6, 7. Since customer IDs 4 and 5 are in the customers table but do not appear in the orders table.

Explain what this result tells you about these customers.

customers with IDs 4 and 5 are listed in the customers table, meaning they are registered customers, but they **have not placed any orders in the orders table.

Display the result

print(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

Practical Application (4 points) Imagine you’re analyzing customer behavior. # Which join would you use to find all customers, including those who haven’t placed any orders? Why? ## To find all customers, including those who haven’t placed any orders, the best join to use would be a left join with the customers table as the left table and the orders table as the right table. All rows from the left table customers, regardless of whether they have a match in the right table orders. Matching rows from the right table orders where applicable, and for customers who haven’t placed any orders, the order-related columns will show NA (null) values. # Which join would you use to find only the customers who have placed orders? Why? ## An inner join returns only the rows that have matching values in both tables. In this case, it will include only those customers who have placed orders. - This ensures that you get a clear view of your active customers who are engaged with the business by making purchases. #Write the R code for both scenarios.

q_left <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
q_inner <- inner_join(customers, orders)  
## Joining with `by = join_by(customer_id)`

Display the result

print(q_inner)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
print (q_left)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

Challenge Question (3 points) Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders. Hint: You’ll need to use a combination of joins and group_by/summarize operations.

customer_orders <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
customer_summary <- customer_orders %>%
  group_by(name, city) %>%
summarize(
    total_orders = n_distinct(order_id, na.rm = TRUE),  
    total_spent = sum(amount, na.rm = TRUE) 
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
print(customer_summary)
## # A tibble: 5 × 4
##   name    city        total_orders total_spent
##   <chr>   <chr>              <int>       <dbl>
## 1 Alice   New York               1        1200
## 2 Bob     Los Angeles            2        2300
## 3 Charlie Chicago                1         300
## 4 David   Houston                0           0
## 5 Eve     Phoenix                0           0