R Markdown

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

#1. Inner Join (3 points) Perform an inner join between the customers and orders datasets.

#A. How many rows are in the result? #b. Why are some customers or orders not included in the result? #c. Display the result

#a
q1 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`
#b
nrow(q1)
## [1] 4
#c
head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

#2. a.Left Join (3 points) Perform a left join with customers as the left table and orders as the right table. #b.How many rows are in the result? #c.Explain why this number differs from the inner join result. #d.Display the result

q2 <- left_join(customers, orders, by = "customer_id")
nrow(q2)
## [1] 6
#Left join includes all the rows from the customers table even if it doenst match with the orders table
head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

#3.Right Join (3 points) Perform a right join with customers as the left table and orders as the right table.

#How many rows are in the result? #Which customer_ids in the result have NULL for customer name and city? Explain why. #Display the result

q3 <- right_join(customers, orders, by = "customer_id")
#a
nrow(q3)
## [1] 6
#b. rows 5 and 6 (customer_id 6 and 7) do not display because while they exist in the orders table where the join starts from, they didnt exist in the customers
#c
head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
#4.Full Join (3 points) Perform a full join between customers and orders.
q4 <- full_join(customers, orders, by = "customer_id")
#How many rows are in the result?
nrow(q4)
## [1] 8
 #Identify any rows where there’s information from only one table. Explain these results.
#Rows 5,6,7,8. A full join joins the rows of both tables reguardless of missing information from one table
#Display the result
head(q4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
#Semi Join (3 points) Perform a semi join with customers as the left table and orders as the right table.
q5 <- semi_join(customers, orders, by = "customer_id")
#How many rows are in the result?
nrow(q5)
## [1] 3
 # How does this result differ from the inner join result?
#semi join only includes the rows from customers table that are matched by orders table.
  #Display the result
head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
#Anti Join (3 points) Perform an anti join with customers as the left table and orders as the right table.
q6 <- anti_join(customers, orders, by = "customer_id")

#Which customers are in the result?
#david and eve
 # Explain what this result tells you about these customers.
#both customers dont have data in the orders table
#Display the result
head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
#Practical Application (4 points) Imagine you’re analyzing customer behavior.

#Which join would you use to find all customers, including those who haven’t placed any orders? Why?
q7a <- left_join(customers, orders, by = "customer_id")
head(q7a)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
 # Which join would you use to find only the customers who have placed orders? Why?
q7b <- inner_join(customers, orders, by = "customer_id")
head(q7b)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
 #Write the R code for both scenarios.

#Display the result

#Challenge Question (3 points) Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders. Hint: You’ll need to use a combination of joins and group_by/summarize operations.
q8a <- left_join(customers, orders, by = "customer_id")

q8b <-q8a %>%
  group_by(customer_id, name, city)