Detect matches
# How many athletes competed in Summer Olympics?
olympics %>% summarise(sum(str_detect(season, "Summer")))
## # A tibble: 1 × 1
## `sum(str_detect(season, "Summer"))`
## <int>
## 1 222552
# What proportion of entries are Winter Olympics?
olympics %>% summarise(mean(str_detect(season, "Winter")))
## # A tibble: 1 × 1
## `mean(str_detect(season, "Winter"))`
## <dbl>
## 1 0.179
# Filter to athletes from teams starting with "United"
olympics %>% filter(str_detect(team, "^United"))
## # A tibble: 19,046 × 15
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 6 Per Knu… M 31 188 75 Unit… USA 1992… 1992 Winter Albe…
## 2 6 Per Knu… M 31 188 75 Unit… USA 1992… 1992 Winter Albe…
## 3 6 Per Knu… M 31 188 75 Unit… USA 1992… 1992 Winter Albe…
## 4 6 Per Knu… M 31 188 75 Unit… USA 1992… 1992 Winter Albe…
## 5 6 Per Knu… M 33 188 75 Unit… USA 1994… 1994 Winter Lill…
## 6 6 Per Knu… M 33 188 75 Unit… USA 1994… 1994 Winter Lill…
## 7 6 Per Knu… M 33 188 75 Unit… USA 1994… 1994 Winter Lill…
## 8 6 Per Knu… M 33 188 75 Unit… USA 1994… 1994 Winter Lill…
## 9 7 John Aa… M 31 183 72 Unit… USA 1992… 1992 Winter Albe…
## 10 7 John Aa… M 31 183 72 Unit… USA 1992… 1992 Winter Albe…
## # ℹ 19,036 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>
# Filter to athletes whose name contains a middle initial (e.g. "A.")
olympics %>% filter(str_detect(name, "\\b[A-Z]\\."))
## # A tibble: 4,958 × 15
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 281 S. Abdu… M NA NA NA India IND 1928… 1928 Summer Amst…
## 2 281 S. Abdu… M NA NA NA India IND 1928… 1928 Summer Amst…
## 3 300 A. Abdu… M NA 178 70 Iraq IRQ 1960… 1960 Summer Roma
## 4 300 A. Abdu… M NA 178 70 Iraq IRQ 1960… 1960 Summer Roma
## 5 317 Rooseve… M 24 181 73 Phil… PHI 1968… 1968 Summer Mexi…
## 6 317 Rooseve… M 24 181 73 Phil… PHI 1968… 1968 Summer Mexi…
## 7 317 Rooseve… M 24 181 73 Phil… PHI 1968… 1968 Summer Mexi…
## 8 317 Rooseve… M 24 181 73 Phil… PHI 1968… 1968 Summer Mexi…
## 9 317 Rooseve… M 24 181 73 Phil… PHI 1968… 1968 Summer Mexi…
## 10 518 Guy A. … M 23 173 65 Pana… PAN 1976… 1976 Summer Mont…
## # ℹ 4,948 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>
# How many entries have a Gold medal?
sum(str_detect(olympics$medal, "Gold"), na.rm = TRUE)
## [1] 13372
# What proportion of all entries have any medal?
mean(str_detect(olympics$medal, "Gold|Silver|Bronze"), na.rm = TRUE)
## [1] 0.1467379
Replacing matches
# Replace "Summer" and "Winter" in the season column with short words
olympics %>%
mutate(season_short = str_replace_all(season,
c("Summer" = "Sum",
"Winter" = "Win"))) %>%
select(season, season_short) %>%
head(10)
## # A tibble: 10 × 2
## season season_short
## <chr> <chr>
## 1 Summer Sum
## 2 Summer Sum
## 3 Summer Sum
## 4 Summer Sum
## 5 Winter Win
## 6 Winter Win
## 7 Winter Win
## 8 Winter Win
## 9 Winter Win
## 10 Winter Win
# Replace spaces in athlete names with underscores
olympics %>%
mutate(name_underscore = str_replace_all(name, " ", "_")) %>%
select(name, name_underscore) %>%
head(10)
## # A tibble: 10 × 2
## name name_underscore
## <chr> <chr>
## 1 A Dijiang A_Dijiang
## 2 A Lamusi A_Lamusi
## 3 Gunnar Nielsen Aaby Gunnar_Nielsen_Aaby
## 4 Edgar Lindenau Aabye Edgar_Lindenau_Aabye
## 5 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
## 6 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
## 7 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
## 8 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
## 9 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
## 10 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
# Replace only the first space in each name
olympics %>%
mutate(name_split = str_replace(name, " ", "_")) %>%
select(name, name_split) %>%
head(10)
## # A tibble: 10 × 2
## name name_split
## <chr> <chr>
## 1 A Dijiang A_Dijiang
## 2 A Lamusi A_Lamusi
## 3 Gunnar Nielsen Aaby Gunnar_Nielsen Aaby
## 4 Edgar Lindenau Aabye Edgar_Lindenau Aabye
## 5 Christine Jacoba Aaftink Christine_Jacoba Aaftink
## 6 Christine Jacoba Aaftink Christine_Jacoba Aaftink
## 7 Christine Jacoba Aaftink Christine_Jacoba Aaftink
## 8 Christine Jacoba Aaftink Christine_Jacoba Aaftink
## 9 Christine Jacoba Aaftink Christine_Jacoba Aaftink
## 10 Christine Jacoba Aaftink Christine_Jacoba Aaftink
# Clean up event names, remove the sport prefix before Men's/Women's/Mixed
# e.g. "Athletics Men's 100 metres" -> "Men's 100 metres"
olympics %>%
mutate(event_clean = str_replace(event, "^.*?(Men's|Women's|Mixed)", "\\1")) %>%
select(event, event_clean) %>%
head(10)
## # A tibble: 10 × 2
## event event_clean
## <chr> <chr>
## 1 Basketball Men's Basketball Men's Basketball
## 2 Judo Men's Extra-Lightweight Men's Extra-Lightweight
## 3 Football Men's Football Men's Football
## 4 Tug-Of-War Men's Tug-Of-War Men's Tug-Of-War
## 5 Speed Skating Women's 500 metres Women's 500 metres
## 6 Speed Skating Women's 1,000 metres Women's 1,000 metres
## 7 Speed Skating Women's 500 metres Women's 500 metres
## 8 Speed Skating Women's 1,000 metres Women's 1,000 metres
## 9 Speed Skating Women's 500 metres Women's 500 metres
## 10 Speed Skating Women's 1,000 metres Women's 1,000 metres