library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

QUESTION 1: Inner Join (3 points)

a.) Perform an inner join between the customers and orders datasets.

q1 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`

b.)How many rows are in the result?

nrow(q1)
## [1] 4

There are 4 rows in the result.

c.)Why are some customers or orders not included in the result?

Some customers or orders are not included in the result because there are no matches between the two datasets.

d.)Display the result.

head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

QUESTION 2: Left Join (3 points)

a.) Perform a left join with customers as the left table and orders as the right table.

q2 <- left_join(customers , orders)
## Joining with `by = join_by(customer_id)`

b.) How many rows are in the result?

nrow(q2)
## [1] 6

There are 6 rows in the result

c.) Explain why this number differs from the inner join result.

This number is different from the inner join result because instead of combining the two datasets and only including matches between the two, a left join inputs all the rows and values from the left table, regardless if the right table has any matches.

d.) Display the result

head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

QUESTION 3: Right Join (3 points)

a.) Perform a right join with customers as the left table and orders as the right table.

q3 <- right_join(customers , orders)
## Joining with `by = join_by(customer_id)`

b.) How many rows are in the result?

nrow(q3)
## [1] 6

There are 6 rows in the result

c.) Which customer_ids in the result have NULL for customer name and city? Explain why.

Customer IDs 6 and 7 have NULL for customer name and city because with the order data set as the right table using a right join the display returns all rows from the right table but in the orders (left) table, there are no customer names or cities that correspond with those ID numbers which is why it returns with NULL.

d.) Display the result

head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

QUESTION 4: Full Join (3 points)

a.) Perform a full join between customers and orders.

q4 <- full_join(customers , orders)
## Joining with `by = join_by(customer_id)`

b.) How many rows are in the result?

nrow(q4)
## [1] 8

There are 8 rows in the result.

c.) Identify any rows where there’s information from only one table. Explain these results.

Rows 5,6,7, and 8 are rows that have information from only one table. These results stem from what a full join is, where it combines two datasets into one full data set, regardless of matching values.

d.) Display the result

head(q4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

QUESTION 5: Semi Join (3 points)

a.) Perform a semi join with customers as the left table and orders as the right table.

q5 <- semi_join(customers , orders)
## Joining with `by = join_by(customer_id)`

b.) How many rows are in the result?

nrow(q5)
## [1] 3

There are 3 rows in the result.

c.) How does this result differ from the inner join result?

The result differs from the inner join result because the inner join combines both tables and inserts all rows including NULLs, while the semi-join only returns rows that have matches and entirely exclude all rows the would be paired with a NULL value.

d.) Display the result

head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

QUESTION 6: Anti Join (3 points)

a.) Perform an anti join with customers as the left table and orders as the right table.

q6 <- anti_join(customers , orders)
## Joining with `by = join_by(customer_id)`

b.) Which customers are in the result?

q6_pt1 <- q6 %>%
select(name)
head(q6_pt1)
## # A tibble: 2 × 1
##   name 
##   <chr>
## 1 David
## 2 Eve

David and Eve are the customers in the result.

c.) Explain what this result tells you about these customers.

This result tells me that these customers did not make any purchases.

d.) Display the result.

head(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

QUESTION 7: Practical Application (4 points)

a.) Which join would you use to find all customers, including those who haven’t placed any orders? Why?

I would use a left join to find all customers, including those who haven’t place any orders because a left join with customers as the first table because it produces a data set with all customer names and order statuses are not the main focus

b.) Which join would you use to find only the customers who have placed orders? Why?

I would use an inner join to find only the customers who have placed orders because the inner join only returns matches and if we want customers and orders, for us to find customers who placed orders we require the matches in both datasets

c.) Write the R code for both scenarios.

q7_pt1 <- left_join(customers , orders)
## Joining with `by = join_by(customer_id)`
q7_pt2 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`

d.) Display the result.

head(q7_pt1)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
head(q7_pt2)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

QUESTION 8: Challenge Question (3 points).

Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders. Hint: You’ll need to use a combination of joins and group_by/summarize operations.

q8 <- customers %>%
  left_join(orders, by = "customer_id")
customer_summary <- q8 %>%
  group_by(name, city) %>%
  summarize(total_orders = n() , total_amount_spent = sum(amount, na.rm = TRUE))
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
head(customer_summary)
## # A tibble: 5 × 4
## # Groups:   name [5]
##   name    city        total_orders total_amount_spent
##   <chr>   <chr>              <int>              <dbl>
## 1 Alice   New York               1               1200
## 2 Bob     Los Angeles            2               2300
## 3 Charlie Chicago                1                300
## 4 David   Houston                1                  0
## 5 Eve     Phoenix                1                  0