library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
q1 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`
How many rows are in the result?
nrow(q1)
## [1] 4
#Display the result
print(q1)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
q2 <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
nrow(q2)
## [1] 6
print(q2)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q3 <- right_join(customers, orders)
## Joining with `by = join_by(customer_id)`
nrow(q3)
## [1] 6
NULL
(or NA
in
R) for both name
and city
are: Customer ID 6
and Customer ID 7. These customer IDs are present in the orders dataset
but do not exist in the customers dataset.Display the result
print (q3)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
q4 <- full_join(customers, orders)
## Joining with `by = join_by(customer_id)`
nrow(q4)
## [1] 8
print(q4)
## # A tibble: 8 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
## 7 6 <NA> <NA> 105 Camera 600
## 8 7 <NA> <NA> 106 Printer 150
q5 <- semi_join(customers, orders)
## Joining with `by = join_by(customer_id)`
nrow(q5)
## [1] 3
customers
and
orders
where there is a matching customer_id
,
and it includes columns from both tables.#Display the result
print (q5)
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
q6 <- anti_join(customers, orders)
## Joining with `by = join_by(customer_id)`
print(q6)
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
Practical Application (4 points) Imagine you’re analyzing customer
behavior. # Which join would you use to find all customers, including
those who haven’t placed any orders? Why? ## To find all customers,
including those who haven’t placed any orders, the best join to use
would be a left join with the customers table as the left table and the
orders table as the right table. All rows from the left table customers,
regardless of whether they have a match in the right table orders.
Matching rows from the right table orders where applicable, and for
customers who haven’t placed any orders, the order-related columns will
show NA
(null) values. # Which join would you use to find
only the customers who have placed orders? Why? ## An inner join returns
only the rows that have matching values in both tables. In this case, it
will include only those customers who have placed orders. - This ensures
that you get a clear view of your active customers who are engaged with
the business by making purchases. #Write the R code for both
scenarios.
q_left <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
q_inner <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`
Display the result
print(q_inner)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
print (q_left)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
customer_orders <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
customer_summary <- customer_orders %>%
group_by(name, city) %>%
summarize(
total_orders = n_distinct(order_id, na.rm = TRUE),
total_spent = sum(amount, na.rm = TRUE)
) %>%
ungroup()
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
print(customer_summary)
## # A tibble: 5 × 4
## name city total_orders total_spent
## <chr> <chr> <int> <dbl>
## 1 Alice New York 1 1200
## 2 Bob Los Angeles 2 2300
## 3 Charlie Chicago 1 300
## 4 David Houston 0 0
## 5 Eve Phoenix 0 0