library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
#A: How many rows are in the result? # 4 rows
q1 <- inner_join(customers, orders, by = "customer_id")
#B: Why are some customers or orders not included in the result? # There are no matches for them in the opposite table
#C: Display the dataset
head(q1)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
q2 <- left_join(customers, orders, by = "customer_id")
#A: How many rows are in the result? # 6 rows
#B Explain why the number differs from the inner join result # The left join returns all rows from left, and the ones that match on the right, unlike inner join which returns only the matching rows
#C: Display the data
head(q2)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q3 <- right_join(customers, orders, by = "customer_id")
#A: How many rows are in the result? # 6 rows
#B: Which customer_ids in the result have NULL for customer name and city? Explain why. # 6 and 7 have NULL because they do not have a matching result in the customers table
#C: Display the data
head(q3)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
q4 <- full_join(customers, orders, by = "customer_id")
#A: How many rows are in the result? # 8 rows
#B: Identify any rows where there’s information from only one table. Explain these results # Rows 5, 6, 7, and 8. We can see this because 5 and 6 have NA for the orders table data (order_id, product, amount), and rows 7 and 8 have NA for the customer table data (name, city)
#C: Display the data
head(q4)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q5 <- semi_join(customers, orders, by = "customer_id")
#A: How many rows are in the result? # 3 rows
#B: How does the result differ from the inner join? # It doesn’t join the order table because there are no matches, and due to this there is one less row in the set
#C: Display the data
head(q5)
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
q6 <- anti_join(customers, orders, by = "customer_id")
#A: How many rows are in the result # 2 rows
#B: Explain what the results tells you about these customers # This result shows us that the customer data for David and Eve does not match in the customer and orders table, as we see their information in the customers table (customer_id 4 and 5), but they are not included in the order table, but it displays the information we have from them for the customers table
#C: Display the data
head(q6)
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
#A: which join would you use to find all customers, including those who haven’t made orders? # Left join, because left join gives us all data from the left table (customers), even the data that does not have a match with the order table
#B: Which join would you use to find only the customers who have placed orders? Why? # Inner join, because inner join returns all data from both tables where there are matches, and there are matches between the two tables when customers have placed orders
#C: Write the r code for both scenarios
q7_customerstotal <- left_join(customers, orders, by = "customer_id")
q7_customersorders <- inner_join(customers, orders, by = "customer_id")
#D: Display the result
head(q7_customersorders)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
head(q7_customerstotal)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
summary <- customers %>%
left_join(orders, by = "customer_id") %>%
group_by(customer_id, name, city) %>%
summarize(num_orders = n_distinct(order_id), amount_spent = sum(amount, na.rm = TRUE))
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
print(summary)
## # A tibble: 5 × 5
## # Groups: customer_id, name [5]
## customer_id name city num_orders amount_spent
## <dbl> <chr> <chr> <int> <dbl>
## 1 1 Alice New York 1 1200
## 2 2 Bob Los Angeles 2 2300
## 3 3 Charlie Chicago 1 300
## 4 4 David Houston 1 0
## 5 5 Eve Phoenix 1 0