Module 10: Apply it to your data 9

Import your data

# Read ratings.csv from the tidytuesday repository (2022-01-25 data folder).
# show_col_types = FALSE suppresses readr's column-specification message.
rating <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-25/ratings.csv",
  show_col_types = FALSE
)
# Auto-print a preview of the imported tibble.
rating

## # A tibble: 21,831 × 10
##      num     id name          year  rank average bayes_average users_rated url  
##    <dbl>  <dbl> <chr>        <dbl> <dbl>   <dbl>         <dbl>       <dbl> <chr>
##  1   105  30549 Pandemic      2008   106    7.59          7.49      108975 /boa…
##  2   189    822 Carcassonne   2000   190    7.42          7.31      108738 /boa…
##  3   428     13 Catan         1995   429    7.14          6.97      108024 /boa…
##  4    72  68448 7 Wonders     2010    73    7.74          7.63       89982 /boa…
##  5   103  36218 Dominion      2008   104    7.61          7.50       81561 /boa…
##  6   191   9209 Ticket to R…  2004   192    7.41          7.30       76171 /boa…
##  7   100 178900 Codenames     2015   101    7.6           7.51       74419 /boa…
##  8     3 167791 Terraformin…  2016     4    8.42          8.27       74216 /boa…
##  9    15 173346 7 Wonders D…  2015    16    8.11          7.98       69472 /boa…
## 10    35  31260 Agricola      2007    36    7.93          7.81       66093 /boa…
## # ℹ 21,821 more rows
## # ℹ 1 more variable: thumbnail <chr>

# Work with a small sample: the first 50 rows, keeping only the columns
# from name through rank.
ratings <- rating %>%
  head(50) %>%
  select(name:rank)
ratings

## # A tibble: 50 × 3
##    name               year  rank
##    <chr>             <dbl> <dbl>
##  1 Pandemic           2008   106
##  2 Carcassonne        2000   190
##  3 Catan              1995   429
##  4 7 Wonders          2010    73
##  5 Dominion           2008   104
##  6 Ticket to Ride     2004   192
##  7 Codenames          2015   101
##  8 Terraforming Mars  2016     4
##  9 7 Wonders Duel     2015    16
## 10 Agricola           2007    36
## # ℹ 40 more rows

Tools

Detect matches

# Pull the year column out of the sample as a plain vector.
ratings[["year"]]

##  [1] 2008 2000 1995 2010 2008 2004 2015 2016 2015 2007 2002 2014 2016 2009 2005
## [16] 2017 2011 2012 2004 2019 2008 2014 2011 2000 2007 2012 2017 2008 2015 2010
## [31] 2001 2005 2012 2010 2014 2009 1997 2004 1999 2012 2014 2013 2011 2015 2005
## [46] 2009 2012 2016 2015 2015

# Flag years in the 1900s. The pattern is anchored to the whole string, so it
# matches only four-digit values 1900-1999; 2019 (which merely contains "19")
# is excluded by the anchors rather than by what happens to follow the "19".
str_detect(ratings$year, "^19\\d{2}$")

##  [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE

# Count the 1900s years: summing the logical vector counts the TRUEs.
# Anchored pattern matches only four-digit values 1900-1999 (never 2019).
sum(str_detect(ratings$year, "^19\\d{2}$"))

## [1] 3

# Same count inside a dplyr pipeline. The anchored pattern matches only
# four-digit years 1900-1999, so a year such as 2019 is never counted.
ratings %>%
  summarise(amt_1900s = sum(str_detect(year, "^19\\d{2}$")))

## # A tibble: 1 × 1
##   amt_1900s
##       <int>
## 1         3

Extract matches

# Extract the leading "19" plus the decade digit from each year. The ^ anchor
# restricts the match to the start of the value, so a "19" occurring later in
# a year is ignored. Rows with no match come back as NA and are dropped.
ratings %>%
  mutate(col_1900s = str_extract(year, "^19\\d")) %>%
  filter(!is.na(col_1900s))

## # A tibble: 3 × 4
##   name         year  rank col_1900s
##   <chr>       <dbl> <dbl> <chr>    
## 1 Catan        1995   429 199      
## 2 Bohnanza     1997   473 199      
## 3 Lost Cities  1999   324 199

Replacing matches

# Rewrite the first three characters of 1900s years to "200" (e.g. 1995 becomes
# "2005"). The ^ anchor ensures only a leading "19x" is replaced, so a "19"
# occurring later in a value is left untouched. The result is a character
# column, shown alongside name and rank.
ratings %>%
  mutate(y2k_killed_1900s = str_replace(year, "^19\\d", "200")) %>%
  select(name, rank:y2k_killed_1900s)

## # A tibble: 50 × 3
##    name               rank y2k_killed_1900s
##    <chr>             <dbl> <chr>           
##  1 Pandemic            106 2008            
##  2 Carcassonne         190 2000            
##  3 Catan               429 2005            
##  4 7 Wonders            73 2010            
##  5 Dominion            104 2008            
##  6 Ticket to Ride      192 2004            
##  7 Codenames           101 2015            
##  8 Terraforming Mars     4 2016            
##  9 7 Wonders Duel       16 2015            
## 10 Agricola             36 2007            
## # ℹ 40 more rows

Module 10: Apply it to your data 9

Simon Champney

Import your data

Chapter 14

Tools

Detect matches

Extract matches

Replacing matches