library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Dataset 2: Orders
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
q1 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`
There are 4 rows
Inner join returns where there is a match on both tables.
The customer data has only 3 customers in the order table where one
customer has 2 orders
head(q1)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
q2 <- left_join(customers , orders)
## Joining with `by = join_by(customer_id)`
There are 6 rows
Left join returns all rows from the left table (customers), and the
matching rows from the right table (orders), rather than returning where
there is a match on both tables.
If there is no match, it fills in NA for the right table’s columns.
head(q2)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q3 <- right_join(customers , orders)
## Joining with `by = join_by(customer_id)`
There are 6 rows
customer_id: 6 and customer_id: 7.
These customer_id values exist in the orders dataset but do not have
corresponding entries in the customers dataset.
head(q3)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
q4 <- full_join(customers , orders)
## Joining with `by = join_by(customer_id)`
There are 8 rows
The rows with customer_id 4 and 5 show customers who did not place any orders, therefore providing no order information. The rows with customer_id 6 and 7 represent orders made by customers not listed in the customers dataset, causing the lack of customer information.
print(q4)
## # A tibble: 8 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
## 7 6 <NA> <NA> 105 Camera 600
## 8 7 <NA> <NA> 106 Printer 150
q5 <- semi_join(customers , orders)
## Joining with `by = join_by(customer_id)`
There are 3 rows
Semi join returns rows from the left table (customers) that have
matching values in the right table (orders). Only includes columns from
the left table and does not duplicate rows for multiple matches.
Where inner join Combines rows from both tables where there are matching
values in the specified columns. Returns all columns from both tables
for those matched rows.
Does not show duplicates.
print(q5)
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
q6 <- anti_join(customers , orders)
## Joining with `by = join_by(customer_id)`
David (customer_id 4)
Eve (customer_id 5)
The main purpose of an anti join is to identify records in one
dataset that do not have corresponding records in another dataset.
David and Eve do not have any entries in the orders dataset, meaning
they have not placed any orders.
print(q6)
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
Left Join.
A left join returns all rows from the left table (customers), along with
matched rows from the right table (orders).
If there are no matches, it still returns the rows from the left table
with NA values for columns from the right table.
This way, you can see all customers, including those who haven’t placed
any orders.
Inner Join.
An inner join returns only the rows where there are matches between the
left table (customers) and the right table (orders).
This means it will only return customers who have placed at least one
order, excluding those without any orders.
q7a <- left_join(customers , orders)
## Joining with `by = join_by(customer_id)`
q7b <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`
print(q7a)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
print(q7b)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
Challenge Question. Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders.
customer_summary <- customers %>%
left_join(orders, by = "customer_id") %>%
group_by(customer_id, name, city) %>%
summarize(
total_orders = sum(!is.na(order_id)),
total_amount_spent = sum(amount, na.rm = TRUE)
) %>%
ungroup()
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
print(customer_summary)
## # A tibble: 5 × 5
## customer_id name city total_orders total_amount_spent
## <dbl> <chr> <chr> <int> <dbl>
## 1 1 Alice New York 1 1200
## 2 2 Bob Los Angeles 2 2300
## 3 3 Charlie Chicago 1 300
## 4 4 David Houston 0 0
## 5 5 Eve Phoenix 0 0