### Assignment 4
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix"))

# Dataset 2: Orders
orders <- tibble(order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150))
##1 
#inner_join(customers, orders, by = "customer_id")
##1a
# THere are four rows in the result
##1b
# Some orders/customers aren't included in the result because on the orders table there are customer Ids that don't appear on the customers table
# 1c
inner_join(customers, orders, by = "customer_id")
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 2
#left_join(customers, orders, by = "customer_id")
## 2a
# There are 6 rows
## 2b
# This number differs from the inner join because when using left join it returns all rows from the left table while inner join returns only rows that match
## 2c
left_join(customers, orders, by = "customer_id")
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 3
#right_join(customers, orders, by = "customer_id")
## 3a
# There are 6 rows
## 3b
# Customer IDs 6 and 7 do, this is because the number is in the orders table, but there is no customer id 6 or 7 in the customers table
## 3c
right_join(customers, orders, by = "customer_id")
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
## 4
#full_join(customers, orders, by = "customer_id")
## 4a
# there are 8 rows
## 4b
# Rows 5-6 are missing data from the orders table and rows 7-8 are missing data from the customers table. This is because full join gives all rows from each table
## 4c
full_join(customers, orders, by = "customer_id")
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150
## 5
#semi_join(customers, orders, by = "customer_id")
## 5a
# There are 3 rows in the result
## 5b
# this is different from the inner join because semi join only uses rows from the left table but only uses the rows where there is a match with the right
## 5c
semi_join(customers, orders, by = "customer_id")
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
## 6
#anti_join(customers, orders, by = "customer_id")
## 6a
# David and Eve are the two customers in the result
## 6b
# this tells me that David and Eve are the only two customers with no orders on the order table
## 6c
anti_join(customers, orders, by = "customer_id")
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
## 7
## 7a
# I would use left join and have the customers on the left because left join displays all results from the left table
## 7b 
#I would use semi join with customers on the left because semi join returns all data from the left table if it matches with the other table
## 7c
# semi_join(customers, orders, by = "customer_id") and left_join(customers, orders, by = "customer_id")
## 7d
semi_join(customers, orders, by = "customer_id")
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
left_join(customers, orders, by = "customer_id")
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## Challenge question 8
left_join(customers, orders, by ="customer_id") %>%
  group_by(customer_id, name, city) %>%
  summarise(order_id = sum(!is.na(order_id)), amount = sum(amount, na.rm = TRUE)) 
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
## # A tibble: 5 × 5
## # Groups:   customer_id, name [5]
##   customer_id name    city        order_id amount
##         <dbl> <chr>   <chr>          <int>  <dbl>
## 1           1 Alice   New York           1   1200
## 2           2 Bob     Los Angeles        2   2300
## 3           3 Charlie Chicago            1    300
## 4           4 David   Houston            0      0
## 5           5 Eve     Phoenix            0      0