q1 <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`
The resulting data has 4 rows
Because the customers and orders that are no included did not have a match in the other table
head(q1)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
q2 <- left_join(customers, orders, by = "customer_id")
The resulting data has 6 rows
The data is joined primarily from the data on the left, therefore, although some customer_id’s are not linked to an order_id they will still be included as they are collected from the left table first.
head(q2)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q3 <- right_join(customers, orders, by = "customer_id")
The resulting data has 6 rows
It is a right join adn thesefore all order_id’s are inlcuded. some order_id’s do not link with an existing customer_id bust are still included in the table due to the type of join
head(q3)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
q4 <- full_join(customers, orders)
## Joining with `by = join_by(customer_id)`
The resulting data has 8 rows
customers_only <- q4 %>%
filter(is.na(order_id))
customers_only
## # A tibble: 2 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 4 David Houston NA <NA> NA
## 2 5 Eve Phoenix NA <NA> NA
orders_only <- q4 %>%
filter(is.na(name))
orders_only
## # A tibble: 2 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 6 <NA> <NA> 105 Camera 600
## 2 7 <NA> <NA> 106 Printer 150
full joins bring together all data despite if it has data relevent to both tables therefore all data in both customers and orders tables will be inlcuded
q5 <- semi_join(customers, orders, by = "customer_id")
The resulting data has 3 rows
semi join returns only rows from customers, it also only does not duplicate rows nor customers who have not ordered
head(q5)
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
q6 <- anti_join(customers, orders, by = "customer_id")
q6 %>%
select(customer_id, name)
## # A tibble: 2 × 2
## customer_id name
## <dbl> <chr>
## 1 4 David
## 2 5 Eve
Anti join returns customers who do not have matching rown in orders
head(q6)
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
q7 <- left_join(customers, orders, by = "customer_id")
the left join keeps every customer in the dataset
q7b <- inner_join(customers, orders, by = "customer_id")
this shows only the real customers who have placed orders
head(q7)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
head(q7b)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
challenge_table <- customers %>%
left_join(orders, by = "customer_id") %>%
group_by(customer_id, name, city) %>%
summarize(
total_orders = sum(!is.na(order_id)),
total_spent = sum(amount, na.rm = TRUE),
.groups = "drop"
)
head(challenge_table)
## # A tibble: 5 × 5
## customer_id name city total_orders total_spent
## <dbl> <chr> <chr> <int> <dbl>
## 1 1 Alice New York 1 1200
## 2 2 Bob Los Angeles 2 2300
## 3 3 Charlie Chicago 1 300
## 4 4 David Houston 0 0
## 5 5 Eve Phoenix 0 0