# Load the TidyTuesday (2023-02-14) movie age-gap data straight from GitHub.
# Column types are guessed by readr, which is why the column specification
# message is printed below.
age_gaps <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-14/age_gaps.csv"
)
## Rows: 1155 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): movie_name, director, actor_1_name, actor_2_name, character_1_gend...
## dbl (5): release_year, age_difference, couple_number, actor_1_age, actor_2_age
## date (2): actor_1_birthdate, actor_2_birthdate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count the rows whose release year ends in "19". `release_year` is a double
# column, so it is matched via its character representation.
# The summarised expression is left unnamed on purpose: the result column is
# labelled with the expression itself.
summarise(
  age_gaps,
  sum(str_detect(release_year, "19$"))
)
## # A tibble: 1 × 1
## `sum(str_detect(release_year, "19$"))`
## <int>
## 1 20
# Peek at the first ten TRUE/FALSE flags: does the release year end in "19"?
head(str_detect(age_gaps$release_year, "19$"), n = 10)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Summing the logical vector counts the rows whose release year ends in "19".
age_gaps$release_year %>%
  str_detect("19$") %>%
  sum()
## [1] 20
# Averaging the logical vector gives the proportion of rows whose release
# year ends in "19".
age_gaps$release_year %>%
  str_detect("19$") %>%
  mean()
## [1] 0.01731602
# Build a single alternation regex ("red|orange|...") from the colour names.
# NOTE(review): the alternatives carry no word-boundary anchors, so a name
# such as "red" can also match inside a longer word.
colours <- c(
  "red", "orange", "yellow",
  "green", "blue", "purple"
)
colour_match <- colours %>% str_c(collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
# Keep only the sentences that match the colour pattern, then pull out the
# first match from each of them (`str_extract()` returns one match per string).
has_colour <- sentences %>% str_subset(colour_match)
has_colour %>% str_extract(colour_match)
## [1] "blue" "blue" "red" "red" "red" "blue" "yellow" "red"
## [9] "red" "green" "red" "red" "blue" "red" "red" "red"
## [17] "red" "blue" "red" "blue" "red" "green" "red" "red"
## [25] "red" "red" "red" "red" "green" "red" "green" "red"
## [33] "purple" "green" "red" "red" "red" "red" "red" "blue"
## [41] "red" "blue" "red" "red" "red" "red" "green" "green"
## [49] "green" "red" "red" "yellow" "red" "orange" "red" "red"
## [57] "red"
# Rough "noun" heuristic: an article ("a" or "the"), one space, then a run of
# non-space characters.
# NOTE(review): the article is not anchored to a word boundary, so the "a" can
# be the last letter of the preceding word (see "a helps" in the output).
noun <- "(a|the) ([^ ]+)"

# First ten sentences that contain the pattern, then the matched text itself.
had_nouns <- head(str_subset(sentences, noun), n = 10)
str_extract(had_nouns, noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
# Add `release_year_rev`: `release_year` with the first upper-case ASCII
# letter replaced by "-".
# NOTE(review): `release_year` is a numeric column, so "[A-Z]" presumably
# never matches and this only turns the year into a character column --
# confirm whether a different column or pattern was intended.
age_gaps %>%
  mutate(release_year_rev = str_replace(release_year, "[A-Z]", "-"))
## # A tibble: 1,155 × 14
## movie_name release_year director age_difference couple_number actor_1_name
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Harold and M… 1971 Hal Ash… 52 1 Ruth Gordon
## 2 Venus 2006 Roger M… 50 1 Peter O'Too…
## 3 The Quiet Am… 2002 Phillip… 49 1 Michael Cai…
## 4 The Big Lebo… 1998 Joel Co… 45 1 David Huddl…
## 5 Beginners 2010 Mike Mi… 43 1 Christopher…
## 6 Poison Ivy 1992 Katt Sh… 42 1 Tom Skerritt
## 7 Whatever Wor… 2009 Woody A… 40 1 Larry David
## 8 Entrapment 1999 Jon Ami… 39 1 Sean Connery
## 9 Husbands and… 1992 Woody A… 38 1 Woody Allen
## 10 Magnolia 1999 Paul Th… 38 1 Jason Robar…
## # ℹ 1,145 more rows
## # ℹ 8 more variables: actor_2_name <chr>, character_1_gender <chr>,
## # character_2_gender <chr>, actor_1_birthdate <date>,
## # actor_2_birthdate <date>, actor_1_age <dbl>, actor_2_age <dbl>,
## # release_year_rev <chr>
# Add `release_year_rev`: `release_year` with every upper-case ASCII letter
# replaced by "-" (`str_replace_all()` substitutes all matches, not just the
# first one).
# NOTE(review): `release_year` is a numeric column, so "[A-Z]" presumably
# never matches and this only turns the year into a character column --
# confirm whether a different column or pattern was intended.
age_gaps %>%
  mutate(release_year_rev = str_replace_all(release_year, "[A-Z]", "-"))
## # A tibble: 1,155 × 14
## movie_name release_year director age_difference couple_number actor_1_name
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Harold and M… 1971 Hal Ash… 52 1 Ruth Gordon
## 2 Venus 2006 Roger M… 50 1 Peter O'Too…
## 3 The Quiet Am… 2002 Phillip… 49 1 Michael Cai…
## 4 The Big Lebo… 1998 Joel Co… 45 1 David Huddl…
## 5 Beginners 2010 Mike Mi… 43 1 Christopher…
## 6 Poison Ivy 1992 Katt Sh… 42 1 Tom Skerritt
## 7 Whatever Wor… 2009 Woody A… 40 1 Larry David
## 8 Entrapment 1999 Jon Ami… 39 1 Sean Connery
## 9 Husbands and… 1992 Woody A… 38 1 Woody Allen
## 10 Magnolia 1999 Paul Th… 38 1 Jason Robar…
## # ℹ 1,145 more rows
## # ℹ 8 more variables: actor_2_name <chr>, character_1_gender <chr>,
## # character_2_gender <chr>, actor_1_birthdate <date>,
## # actor_2_birthdate <date>, actor_1_age <dbl>, actor_2_age <dbl>,
## # release_year_rev <chr>