library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Dataset 2: Orders
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150))
7. Practical Application (4 points) Imagine you’re analyzing
customer behavior.
a. Which join would you use to find all customers, including those
who haven’t placed any orders? Why?
To find all customers, including those who haven’t placed any
orders, you would use a left join. This is because it will include all
records from the customers table and add matching records from the
orders tables. The result will still include customers who have not
placed an order with a n/a for their order.
b. Which join would you use to find only the customers who have
placed orders? Why?
To find only the customers who have placed orders, you would use an
inner join. This is because it retrieves only the rows that have
matching values in both tables.This means only those customers who have
corresponding records in the orders table. Using inner join will show
only customers who have made purchases.
c. Write the R code for both scenarios.
Left Join: all_customers <- customers %>%left_join(orders, by
= “customer_id”)
Inner Join: active_customers_orders <- inner_join(customers,
orders, by = “customer_id”)
d. Display the result
# Left Join: All customers including those without orders
all_customers <- customers %>%
left_join(orders, by = "customer_id")
# Print the result
print(all_customers)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
# Inner Join: Only customers who have placed orders
active_customers <- customers %>%
inner_join(orders, by = "customer_id")
# Print the result
print(active_customers)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
8. Challenge Question (3 points) Create a summary that shows each
customer’s name, city, total number of orders, and total amount spent.
Include all customers, even those without orders. Hint: You’ll need to
use a combination of joins and group_by/summarize operations.
# Create the summary with a full join
summary_data <- customers %>%
full_join(orders, by = "customer_id") %>%
group_by(customer_id, name, city) %>%
summarise(
total_orders = sum(!is.na(order_id)), # Count of orders using !is.na
total_amount = sum(amount, na.rm = TRUE) # Sum of amounts, ignoring NAs
) %>%
ungroup() %>%
mutate(
total_orders = ifelse(is.na(total_orders), 0, total_orders), # Replace NA with 0
total_amount = ifelse(is.na(total_amount), 0, total_amount) # Replace NA with 0
)
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
# Print the summary
print(summary_data)
## # A tibble: 7 × 5
## customer_id name city total_orders total_amount
## <dbl> <chr> <chr> <int> <dbl>
## 1 1 Alice New York 1 1200
## 2 2 Bob Los Angeles 2 2300
## 3 3 Charlie Chicago 1 300
## 4 4 David Houston 0 0
## 5 5 Eve Phoenix 0 0
## 6 6 <NA> <NA> 1 600
## 7 7 <NA> <NA> 1 150