knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl) # for importing excel files
library(janitor) # cleaning data
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

Import data

survivalists <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-24/survivalists.csv')
## Rows: 94 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, gender, city, state, country, reason_tapped_out, reason_cate...
## dbl  (5): season, age, result, days_lasted, day_linked_up
## lgl  (1): medically_evacuated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loadouts <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-24/loadouts.csv')
## Rows: 940 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): version, name, item_detailed, item
## dbl (2): season, item_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Make data small

set.seed(1234)
survivalists_small <- survivalists %>% select(1:3) %>% sample_n(10)

loadouts_small <- loadouts %>% select(season, name, item) %>% sample_n(10)

survivalists_small
## # A tibble: 10 × 3
##    season name                 age
##     <dbl> <chr>              <dbl>
##  1      3 Britt Ahart           40
##  2      8 Nate Weber            47
##  3      3 Carleigh Fairchild    28
##  4      1 Chris Weatherman      41
##  5      1 Dustin Feher          37
##  6      4 Brody Wilkes          33
##  7      2 Randy Champagne       28
##  8      1 Lucas Miller          32
##  9      9 Karie Lee Knoke       57
## 10      7 Joe Nicholas          31
loadouts_small
## # A tibble: 10 × 3
##    season name             item         
##     <dbl> <chr>            <chr>        
##  1      1 Chris Weatherman Knife        
##  2      9 Jessie Krebs     Trapping wire
##  3      3 Jim Shields      Rations      
##  4      4 Shannon Bosdell  Tarp         
##  5      2 Nicole Apelian   Fishing gear 
##  6      6 Barry Karcher    Saw          
##  7      1 Alan Kay         Pot          
##  8      9 Tom Garstang     Saw          
##  9      7 Kielyn Marrone   Trapping wire
## 10      6 Woniya Thibeault Pot

Inner Join

Resulting dataset has one row and four columns. “Age” from survivalists data set, and “item” is from loadouts dataset.

survivalists_small %>% inner_join(loadouts_small, by = c("name", "season"))
## # A tibble: 1 × 4
##   season name               age item 
##    <dbl> <chr>            <dbl> <chr>
## 1      1 Chris Weatherman    41 Knife

Left Join

Resulting dataset has 10 rows and 4 columns. Since it’s a left join, it has all data from “Survivalists_small”, and added Chris Weatherman’s item in the “item” column since he was common in both datasets.

survivalists_small %>% left_join(loadouts_small, by = c("name", "season"))
## # A tibble: 10 × 4
##    season name                 age item 
##     <dbl> <chr>              <dbl> <chr>
##  1      3 Britt Ahart           40 <NA> 
##  2      8 Nate Weber            47 <NA> 
##  3      3 Carleigh Fairchild    28 <NA> 
##  4      1 Chris Weatherman      41 Knife
##  5      1 Dustin Feher          37 <NA> 
##  6      4 Brody Wilkes          33 <NA> 
##  7      2 Randy Champagne       28 <NA> 
##  8      1 Lucas Miller          32 <NA> 
##  9      9 Karie Lee Knoke       57 <NA> 
## 10      7 Joe Nicholas          31 <NA>

Right Join

Resulting dataset has 10 rows and 4 columns. Since it’s a right join, it has all data from “loadouts_small”, and shows Chris Weatherman’s age in the “age” column.

survivalists_small %>% right_join(loadouts_small, by = c("name", "season"))
## # A tibble: 10 × 4
##    season name               age item         
##     <dbl> <chr>            <dbl> <chr>        
##  1      1 Chris Weatherman    41 Knife        
##  2      9 Jessie Krebs        NA Trapping wire
##  3      3 Jim Shields         NA Rations      
##  4      4 Shannon Bosdell     NA Tarp         
##  5      2 Nicole Apelian      NA Fishing gear 
##  6      6 Barry Karcher       NA Saw          
##  7      1 Alan Kay            NA Pot          
##  8      9 Tom Garstang        NA Saw          
##  9      7 Kielyn Marrone      NA Trapping wire
## 10      6 Woniya Thibeault    NA Pot

Full Join

Resulting dataset has 19 rows and 4 columns. It’s a full join so a lot of the data has “NA”. Only one of the observations matches in both datasets, so that is why there is 19 rows and not 20. The matching observation was for Chris Weatherman, and we can see that his observation matched because all of his column information is filled out without “NA”.

survivalists_small %>% full_join(loadouts_small, by = c("name", "season"))
## # A tibble: 19 × 4
##    season name                 age item         
##     <dbl> <chr>              <dbl> <chr>        
##  1      3 Britt Ahart           40 <NA>         
##  2      8 Nate Weber            47 <NA>         
##  3      3 Carleigh Fairchild    28 <NA>         
##  4      1 Chris Weatherman      41 Knife        
##  5      1 Dustin Feher          37 <NA>         
##  6      4 Brody Wilkes          33 <NA>         
##  7      2 Randy Champagne       28 <NA>         
##  8      1 Lucas Miller          32 <NA>         
##  9      9 Karie Lee Knoke       57 <NA>         
## 10      7 Joe Nicholas          31 <NA>         
## 11      9 Jessie Krebs          NA Trapping wire
## 12      3 Jim Shields           NA Rations      
## 13      4 Shannon Bosdell       NA Tarp         
## 14      2 Nicole Apelian        NA Fishing gear 
## 15      6 Barry Karcher         NA Saw          
## 16      1 Alan Kay              NA Pot          
## 17      9 Tom Garstang          NA Saw          
## 18      7 Kielyn Marrone        NA Trapping wire
## 19      6 Woniya Thibeault      NA Pot

Semi Join

Resulting dataset has only 1 row and 3 columns. Semi Join is a filtering operation. No columns in loadouts_small will appear, but it will act as a filter for the survivalists_small dataset which is why Chris Weatherman appears (his data can also be found in the loadouts_small dataset, so the filter selected him to be shown).

survivalists_small %>% semi_join(loadouts_small, by = c("name", "season"))
## # A tibble: 1 × 3
##   season name               age
##    <dbl> <chr>            <dbl>
## 1      1 Chris Weatherman    41

Anti Join

Resulting dataset shows 9 rows and 3 columns. Code is filtering out matching observations that occur in both survivalists_small and loadouts_small. The matching observation is Chris Weatherman which is no longer appearing in the output.

survivalists_small %>% anti_join(loadouts_small, by = c("name", "season"))
## # A tibble: 9 × 3
##   season name                 age
##    <dbl> <chr>              <dbl>
## 1      3 Britt Ahart           40
## 2      8 Nate Weber            47
## 3      3 Carleigh Fairchild    28
## 4      1 Dustin Feher          37
## 5      4 Brody Wilkes          33
## 6      2 Randy Champagne       28
## 7      1 Lucas Miller          32
## 8      9 Karie Lee Knoke       57
## 9      7 Joe Nicholas          31