library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

QUESTION 1: Perform an inner join between the customers and orders datasets.

q1 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

There are four rows in the result

Why are some customers or orders not included in the result?

These two tables were joined by customer id and only three of their ids match up so the rest were taken out.

Display the result

head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

QUESTION 2 Perform a left join with customers as the left table and orders as the right table.

q2 <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

There are six rows in this result

Explain why this number differs from the inner join result.

The left join includes all customers regardless of whether they have placed any orders.

Display the result

head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

QUESTION 3 Perform a right join with customers as the left table and orders as the right table.

q3 <- right_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

There are six rows in this result

Which customer_ids in the result have NULL for customer name and city? Explain why.

customer IDs 6 and 7 will have NULL values for the name and city columns. This occurs because these customer IDs are not present in the customers dataset

Display the result

head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

QUESTION 4 Perform a full join between customers and orders.

q4 <- full_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

There are eight rows in this result

Identify any rows where there’s information from only one table. Explain these results.

five six seven and eight all have missing information. This is because it has customers who haven’t made an order and orders with no existent customers.

Display the result

head(q4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

QUESTION 5 Perform a semi join with customers as the left table and orders as the right table.

q5 <- semi_join(customers, orders)
## Joining with `by = join_by(customer_id)`

How many rows are in the result?

There are only three rows in this result

How does this result differ from the inner join result?

While both the semi join and the inner join include only those customers who have orders, the difference is that the semi join returns only the columns from the customers dataset. The inner join returns columns from both datasets.

Display the result

head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

QUESRION 6 Perform an anti join with customers as the left table and orders as the right table.

q6 <- anti_join(customers, orders)
## Joining with `by = join_by(customer_id)`

Which customers are in the result?

The result will show customers from the customers dataset who do not have any matching customer id in the orders dataset.

Explain what this result tells you about these customers.

These results shows that they may be new customers who have not yet made a purchase.

Display the result

head(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

QUESTION 7 Which join would you use to find all customers, including those who haven’t placed any orders? Why?

A left join allows you to include all customers from the customers dataset, regardless of whether they have placed any orders.

Which join would you use to find only the customers who have placed orders? Why?

An inner join will return only those customers who have placed at least one order.

Write the R code for both scenarios.

q7_pt1 <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
q7_pt2 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`

Display the result

head(q7_pt1)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
head(q7_pt2)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

QUESTION 8 Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders.

summary_data <- customers %>%
  left_join(orders, by = "customer_id") %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = n(),                     
    total_spent = sum(amount, na.rm = TRUE) 
  ) %>%
  ungroup()                                 
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
print(summary_data)
## # A tibble: 5 × 5
##   customer_id name    city        total_orders total_spent
##         <dbl> <chr>   <chr>              <int>       <dbl>
## 1           1 Alice   New York               1        1200
## 2           2 Bob     Los Angeles            2        2300
## 3           3 Charlie Chicago                1         300
## 4           4 David   Houston                1           0
## 5           5 Eve     Phoenix                1           0