Module 10: Apply it to your data 9

Import your data

# Load the TidyTuesday (2023-02-14) age-gaps data set straight from GitHub.
age_gaps <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-14/age_gaps.csv"
)

## Rows: 1155 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): movie_name, director, actor_1_name, actor_2_name, character_1_gend...
## dbl  (5): release_year, age_difference, couple_number, actor_1_age, actor_2_age
## date (2): actor_1_birthdate, actor_2_birthdate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Chapter 14 Tools

Detect matches

# Count the rows whose release_year, read as text, ends in "19".
summarise(age_gaps, sum(str_detect(release_year, "19$")))

## # A tibble: 1 × 1
##   `sum(str_detect(release_year, "19$"))`
##                                    <int>
## 1                                     20

# Preview the first ten match flags for release years ending in "19".
head(str_detect(age_gaps$release_year, "19$"), 10)

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# Number of release years ending in "19" (each TRUE counts as 1).
age_gaps$release_year %>%
  str_detect("19$") %>%
  sum()

## [1] 20

# Proportion of release years ending in "19" (mean of the logical flags).
age_gaps$release_year %>%
  str_detect("19$") %>%
  mean()

## [1] 0.01731602

Extract matches

# Join the colour names into a single alternation pattern ("a|b|...").
colours <- c(
  "red", "orange", "yellow",
  "green", "blue", "purple"
)
colour_match <- colours %>% str_c(collapse = "|")
colour_match

## [1] "red|orange|yellow|green|blue|purple"

# Keep only the sentences that mention a colour, then pull out the first
# colour matched in each of them.
has_colour <- sentences %>% str_subset(colour_match)
has_colour %>% str_extract(colour_match)

##  [1] "blue"   "blue"   "red"    "red"    "red"    "blue"   "yellow" "red"   
##  [9] "red"    "green"  "red"    "red"    "blue"   "red"    "red"    "red"   
## [17] "red"    "blue"   "red"    "blue"   "red"    "green"  "red"    "red"   
## [25] "red"    "red"    "red"    "red"    "green"  "red"    "green"  "red"   
## [33] "purple" "green"  "red"    "red"    "red"    "red"    "red"    "blue"  
## [41] "red"    "blue"   "red"    "red"    "red"    "red"    "green"  "green" 
## [49] "green"  "red"    "red"    "yellow" "red"    "orange" "red"    "red"   
## [57] "red"

# Rough noun finder: "a" or "the", one space, then a run of non-space
# characters. The pattern has no word-boundary anchor, so the "a" can also be
# the last letter of a longer word -- presumably how "a helps" ends up in the
# results.
noun <- "(a|the) ([^ ]+)"
had_nouns <- head(str_subset(sentences, noun), 10)
str_extract(had_nouns, noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

Replacing matches

# Replace only the FIRST "9" in each release year with "-".
# release_year is a double column, so it is converted to text explicitly.
# A letter pattern such as "[A-Z]" can never match a year, which would leave
# every value unchanged and make the replacement invisible.
age_gaps %>%
  mutate(
    release_year_rev = release_year %>%
      as.character() %>%
      str_replace("9", "-")
  )

## # A tibble: 1,155 × 14
##    movie_name    release_year director age_difference couple_number actor_1_name
##    <chr>                <dbl> <chr>             <dbl>         <dbl> <chr>       
##  1 Harold and M…         1971 Hal Ash…             52             1 Ruth Gordon 
##  2 Venus                 2006 Roger M…             50             1 Peter O'Too…
##  3 The Quiet Am…         2002 Phillip…             49             1 Michael Cai…
##  4 The Big Lebo…         1998 Joel Co…             45             1 David Huddl…
##  5 Beginners             2010 Mike Mi…             43             1 Christopher…
##  6 Poison Ivy            1992 Katt Sh…             42             1 Tom Skerritt
##  7 Whatever Wor…         2009 Woody A…             40             1 Larry David 
##  8 Entrapment            1999 Jon Ami…             39             1 Sean Connery
##  9 Husbands and…         1992 Woody A…             38             1 Woody Allen 
## 10 Magnolia              1999 Paul Th…             38             1 Jason Robar…
## # ℹ 1,145 more rows
## # ℹ 8 more variables: actor_2_name <chr>, character_1_gender <chr>,
## #   character_2_gender <chr>, actor_1_birthdate <date>,
## #   actor_2_birthdate <date>, actor_1_age <dbl>, actor_2_age <dbl>,
## #   release_year_rev <chr>

# Replace EVERY "9" in each release year with "-".
# release_year is a double column, so it is converted to text explicitly.
# A letter pattern such as "[A-Z]" can never match a year, which would leave
# every value unchanged and make the replacement invisible.
age_gaps %>%
  mutate(
    release_year_rev = release_year %>%
      as.character() %>%
      str_replace_all("9", "-")
  )

## # A tibble: 1,155 × 14
##    movie_name    release_year director age_difference couple_number actor_1_name
##    <chr>                <dbl> <chr>             <dbl>         <dbl> <chr>       
##  1 Harold and M…         1971 Hal Ash…             52             1 Ruth Gordon 
##  2 Venus                 2006 Roger M…             50             1 Peter O'Too…
##  3 The Quiet Am…         2002 Phillip…             49             1 Michael Cai…
##  4 The Big Lebo…         1998 Joel Co…             45             1 David Huddl…
##  5 Beginners             2010 Mike Mi…             43             1 Christopher…
##  6 Poison Ivy            1992 Katt Sh…             42             1 Tom Skerritt
##  7 Whatever Wor…         2009 Woody A…             40             1 Larry David 
##  8 Entrapment            1999 Jon Ami…             39             1 Sean Connery
##  9 Husbands and…         1992 Woody A…             38             1 Woody Allen 
## 10 Magnolia              1999 Paul Th…             38             1 Jason Robar…
## # ℹ 1,145 more rows
## # ℹ 8 more variables: actor_2_name <chr>, character_1_gender <chr>,
## #   character_2_gender <chr>, actor_1_birthdate <date>,
## #   actor_2_birthdate <date>, actor_1_age <dbl>, actor_2_age <dbl>,
## #   release_year_rev <chr>

Module 10: Apply it to your data 9

Olivia Pendergast

Import your data

Chapter 14

Tools

Detect matches

Extract matches

Replacing matches