library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Dataset 2: Orders
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
q1 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`
There are 4 rows
Inner join returns where there is a match on both tables. The customer data has only three customers in the order table, where one customer has two orders
head(q1)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
q2 <- left_join(customers, orders, by = "customer_id")
There are 6 rows
This number differs because it wil return all rows not just rows with matching data
head(q2)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q3 <- right_join(customers, orders, by = "customer_id")
There are 6 rows
There is a NULL in for rows 6 and 7, because there are only five customers, therefore there cannot be more customers than there are customer Ids. Two of the customers have ordered more than one item.
head(q3)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
q4 <- full_join(customers, orders, by = "customer_id")
There are 8 rows
In rows four and five there is only data from the customers table and in rows seven and eight there is only data from the orders table. This is because they do not have overlapping data.
head(q4)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q5 <- semi_join(customers, orders, by = "customer_id")
There are 3 rows
In the semi join table there is only data from the customers data frame but instead of putting Bob twice he is only in there once. Bob has two orders, that’s why he was in there twice.
head(q5)
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
q6 <- anti_join(customers, orders, by = "customer_id")
David and Eve are in the result
Those customers are not in the orders table, only in the customers table.
head(q6)
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
You would use left-join to find all customers even those who have not placed any orders. This is because it brings all elements of both tables together.
You would use Semi-join to find only customers who placed an order. If you wanted to see the customers and the order you would use inner-join. For inner join, it returns only elements that match therefore you would get only customers that have placed an order. Semi-join returns all elements that match on both tables therefore you will get a return of people who have placed orders, it will only return names though and not the orders.
all_customers <- merge(customers, orders, by = "customer_id", all.x = TRUE)
all_customers2 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`
head(all_customers)
## customer_id name city order_id product amount
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
head(all_customers2)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
summary_df <- full_join(customers, orders, by = "customer_id")%>%
group_by(customer_id, city)%>%
summarise(total_orders = n(), total_amount = sum(amount, na.rm = TRUE), .groups = "drop")
head(summary_df)
## # A tibble: 6 × 4
## customer_id city total_orders total_amount
## <dbl> <chr> <int> <dbl>
## 1 1 New York 1 1200
## 2 2 Los Angeles 2 2300
## 3 3 Chicago 1 300
## 4 4 Houston 1 0
## 5 5 Phoenix 1 0
## 6 6 <NA> 1 600