library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
orders
## # A tibble: 6 × 4
## order_id customer_id product amount
## <dbl> <dbl> <chr> <dbl>
## 1 101 1 Laptop 1200
## 2 102 2 Phone 800
## 3 103 3 Tablet 300
## 4 104 2 Desktop 1500
## 5 105 6 Camera 600
## 6 106 7 Printer 150
inner_join_result <- inner_join(customers, orders, by = "customer_id")
inner_join_result
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
How Many rows are in the result? 4 customers
Why are some customers or orders not included in the result? Because if the customer_id do not match with both data sets it will not output in the results
left_join_result <- left_join(customers, orders, by = "customer_id")
left_join_result
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
How many rows are in result? 6 rows
Explain why this number differs from the inner join result? Because it outputs everything from the left of customer and inner join data set.
right_join_result <- right_join(customers, orders, by = "customer_id")
right_join_result
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
nrow(right_join_result)
## [1] 6
How many rows are in the result? 6 Rows
Which customer_ids in the result have NULL for customer name and city? Customer_ids 6 and 7 both have NULL output because their was never a customer name for Customer_ids so the data set outputted nothing (Null).
full_join_result <- full_join(customers, orders, by = "customer_id")
full_join_result
## # A tibble: 8 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
## 7 6 <NA> <NA> 105 Camera 600
## 8 7 <NA> <NA> 106 Printer 150
nrow(full_join_result)
## [1] 8
How many rows are in the result? 8 rows
Identify any rows where there’s information only one table? All the information from the order and customer data set will be in full join data set.
semi_join_result <- semi_join(customers, orders, by = "customer_id")
semi_join_result
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
nrow(semi_join_result)
## [1] 3
How many rows are in the result? 3 Rows
How does this result differ from the inner join result? Inner join has a another row for Bob which is not displayed in semi join. Because semi join returns only the rows from the first data frame (customers) that match the second data frame (orders).
anti_join_result <- anti_join(customers, orders, by = "customer_id")
anti_join_result
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
anti_join_result$name
## [1] "David" "Eve"
Which customers are in the result? David and Eve
Explain what the result tells you about the customers? David and Eva are both not connected to the order data set table.
customers_left_join <- customers %>%
left_join(orders, by = "customer_id")
customers_inner_join <- customers %>%
inner_join(orders, by = "customer_id")
print("All customers (including those without orders):")
## [1] "All customers (including those without orders):"
print(customers_left_join)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
print("Only customers who have placed orders:")
## [1] "Only customers who have placed orders:"
print(customers_inner_join)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
Which join would you use to find all customers, including those who haven’t placed any orders? Why? Use left_join to find all customers and customers who have not placed orders
which join would you use to find only the customers, including those who those who have placed orders? why? To find only the customers who have placed orders you would use inner join because inner join will filter out customers who have not placed any orders.
library(tidyr)
customer_orders_summary <- customers %>%
left_join(orders, by = "customer_id") %>%
replace_na(list(amount = 0)) %>%
group_by(customer_id,name, city) %>%
summarize(
total_orders = n(),
total_spent = sum(amount),
.groups = "drop"
)
customer_orders_summary
## # A tibble: 5 × 5
## customer_id name city total_orders total_spent
## <dbl> <chr> <chr> <int> <dbl>
## 1 1 Alice New York 1 1200
## 2 2 Bob Los Angeles 2 2300
## 3 3 Charlie Chicago 1 300
## 4 4 David Houston 1 0
## 5 5 Eve Phoenix 1 0