Import your data

olympics <- read_excel("~/Desktop/PSU_DAT3000_IntroToDA/07_module10/myData.xlsx")

Chapter 14

Tools

Detect matches

# How many athletes competed in Summer Olympics?
olympics %>% summarise(sum(str_detect(season, "Summer")))
## # A tibble: 1 × 1
##   `sum(str_detect(season, "Summer"))`
##                                 <int>
## 1                              222552
# What proportion of entries are Winter Olympics?
olympics %>% summarise(mean(str_detect(season, "Winter")))
## # A tibble: 1 × 1
##   `mean(str_detect(season, "Winter"))`
##                                  <dbl>
## 1                                0.179
# Filter to athletes from teams starting with "United"
olympics %>% filter(str_detect(team, "^United"))
## # A tibble: 19,046 × 15
##       id name     sex   age   height weight team  noc   games  year season city 
##    <dbl> <chr>    <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
##  1     6 Per Knu… M     31    188    75     Unit… USA   1992…  1992 Winter Albe…
##  2     6 Per Knu… M     31    188    75     Unit… USA   1992…  1992 Winter Albe…
##  3     6 Per Knu… M     31    188    75     Unit… USA   1992…  1992 Winter Albe…
##  4     6 Per Knu… M     31    188    75     Unit… USA   1992…  1992 Winter Albe…
##  5     6 Per Knu… M     33    188    75     Unit… USA   1994…  1994 Winter Lill…
##  6     6 Per Knu… M     33    188    75     Unit… USA   1994…  1994 Winter Lill…
##  7     6 Per Knu… M     33    188    75     Unit… USA   1994…  1994 Winter Lill…
##  8     6 Per Knu… M     33    188    75     Unit… USA   1994…  1994 Winter Lill…
##  9     7 John Aa… M     31    183    72     Unit… USA   1992…  1992 Winter Albe…
## 10     7 John Aa… M     31    183    72     Unit… USA   1992…  1992 Winter Albe…
## # ℹ 19,036 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>
# Filter to athletes whose name contains a middle initial (e.g. "A.")
olympics %>% filter(str_detect(name, "\\b[A-Z]\\."))
## # A tibble: 4,958 × 15
##       id name     sex   age   height weight team  noc   games  year season city 
##    <dbl> <chr>    <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
##  1   281 S. Abdu… M     NA    NA     NA     India IND   1928…  1928 Summer Amst…
##  2   281 S. Abdu… M     NA    NA     NA     India IND   1928…  1928 Summer Amst…
##  3   300 A. Abdu… M     NA    178    70     Iraq  IRQ   1960…  1960 Summer Roma 
##  4   300 A. Abdu… M     NA    178    70     Iraq  IRQ   1960…  1960 Summer Roma 
##  5   317 Rooseve… M     24    181    73     Phil… PHI   1968…  1968 Summer Mexi…
##  6   317 Rooseve… M     24    181    73     Phil… PHI   1968…  1968 Summer Mexi…
##  7   317 Rooseve… M     24    181    73     Phil… PHI   1968…  1968 Summer Mexi…
##  8   317 Rooseve… M     24    181    73     Phil… PHI   1968…  1968 Summer Mexi…
##  9   317 Rooseve… M     24    181    73     Phil… PHI   1968…  1968 Summer Mexi…
## 10   518 Guy A. … M     23    173    65     Pana… PAN   1976…  1976 Summer Mont…
## # ℹ 4,948 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>
# How many entries have a Gold medal?
sum(str_detect(olympics$medal, "Gold"), na.rm = TRUE)
## [1] 13372
# What proportion of all entries have any medal?
mean(str_detect(olympics$medal, "Gold|Silver|Bronze"), na.rm = TRUE)
## [1] 0.1467379

Extract matches

# Build a medal pattern and extract medal types from the medal column
medal_types <- c("Gold", "Silver", "Bronze")
medal_match <- str_c(medal_types, collapse = "|")
medal_match
## [1] "Gold|Silver|Bronze"
# Extract the first word of each athlete's name
olympics %>%
  mutate(first_name = str_extract(name, "^[^ ]+")) %>%
  select(name, first_name) %>%
  head(10)
## # A tibble: 10 × 2
##    name                     first_name
##    <chr>                    <chr>     
##  1 A Dijiang                A         
##  2 A Lamusi                 A         
##  3 Gunnar Nielsen Aaby      Gunnar    
##  4 Edgar Lindenau Aabye     Edgar     
##  5 Christine Jacoba Aaftink Christine 
##  6 Christine Jacoba Aaftink Christine 
##  7 Christine Jacoba Aaftink Christine 
##  8 Christine Jacoba Aaftink Christine 
##  9 Christine Jacoba Aaftink Christine 
## 10 Christine Jacoba Aaftink Christine
# Extract the sport category from the event column
# Events follow the pattern "Sport Men's/Women's ..."
olympics %>%
  mutate(event_type = str_extract(event, "(Men's|Women's|Mixed)")) %>%
  select(event, event_type) %>%
  head(10)
## # A tibble: 10 × 2
##    event                              event_type
##    <chr>                              <chr>     
##  1 Basketball Men's Basketball        Men's     
##  2 Judo Men's Extra-Lightweight       Men's     
##  3 Football Men's Football            Men's     
##  4 Tug-Of-War Men's Tug-Of-War        Men's     
##  5 Speed Skating Women's 500 metres   Women's   
##  6 Speed Skating Women's 1,000 metres Women's   
##  7 Speed Skating Women's 500 metres   Women's   
##  8 Speed Skating Women's 1,000 metres Women's   
##  9 Speed Skating Women's 500 metres   Women's   
## 10 Speed Skating Women's 1,000 metres Women's
# Extract the host city year from the games column 
olympics %>%
  mutate(games_year = str_extract(games, "\\d{4}")) %>%
  select(games, games_year) %>%
  head(10)
## # A tibble: 10 × 2
##    games       games_year
##    <chr>       <chr>     
##  1 1992 Summer 1992      
##  2 2012 Summer 2012      
##  3 1920 Summer 1920      
##  4 1900 Summer 1900      
##  5 1988 Winter 1988      
##  6 1988 Winter 1988      
##  7 1992 Winter 1992      
##  8 1992 Winter 1992      
##  9 1994 Winter 1994      
## 10 1994 Winter 1994
# Extract all digits from the event column
olympics %>%
  mutate(event_numbers = str_extract_all(event, "\\d+", simplify = FALSE)) %>%
  select(event, event_numbers) %>%
  head(10)
## # A tibble: 10 × 2
##    event                              event_numbers
##    <chr>                              <list>       
##  1 Basketball Men's Basketball        <chr [0]>    
##  2 Judo Men's Extra-Lightweight       <chr [0]>    
##  3 Football Men's Football            <chr [0]>    
##  4 Tug-Of-War Men's Tug-Of-War        <chr [0]>    
##  5 Speed Skating Women's 500 metres   <chr [1]>    
##  6 Speed Skating Women's 1,000 metres <chr [2]>    
##  7 Speed Skating Women's 500 metres   <chr [1]>    
##  8 Speed Skating Women's 1,000 metres <chr [2]>    
##  9 Speed Skating Women's 500 metres   <chr [1]>    
## 10 Speed Skating Women's 1,000 metres <chr [2]>

Replacing matches

# Replace "Summer" and "Winter" in the season column with short words
olympics %>%
  mutate(season_short = str_replace_all(season, 
                                         c("Summer" = "Sum", 
                                           "Winter" = "Win"))) %>%
  select(season, season_short) %>%
  head(10)
## # A tibble: 10 × 2
##    season season_short
##    <chr>  <chr>       
##  1 Summer Sum         
##  2 Summer Sum         
##  3 Summer Sum         
##  4 Summer Sum         
##  5 Winter Win         
##  6 Winter Win         
##  7 Winter Win         
##  8 Winter Win         
##  9 Winter Win         
## 10 Winter Win
# Replace spaces in athlete names with underscores
olympics %>%
  mutate(name_underscore = str_replace_all(name, " ", "_")) %>%
  select(name, name_underscore) %>%
  head(10)
## # A tibble: 10 × 2
##    name                     name_underscore         
##    <chr>                    <chr>                   
##  1 A Dijiang                A_Dijiang               
##  2 A Lamusi                 A_Lamusi                
##  3 Gunnar Nielsen Aaby      Gunnar_Nielsen_Aaby     
##  4 Edgar Lindenau Aabye     Edgar_Lindenau_Aabye    
##  5 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
##  6 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
##  7 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
##  8 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
##  9 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
## 10 Christine Jacoba Aaftink Christine_Jacoba_Aaftink
# Replace only the first space in each name 
olympics %>%
  mutate(name_split = str_replace(name, " ", "_")) %>%
  select(name, name_split) %>%
  head(10)
## # A tibble: 10 × 2
##    name                     name_split              
##    <chr>                    <chr>                   
##  1 A Dijiang                A_Dijiang               
##  2 A Lamusi                 A_Lamusi                
##  3 Gunnar Nielsen Aaby      Gunnar_Nielsen Aaby     
##  4 Edgar Lindenau Aabye     Edgar_Lindenau Aabye    
##  5 Christine Jacoba Aaftink Christine_Jacoba Aaftink
##  6 Christine Jacoba Aaftink Christine_Jacoba Aaftink
##  7 Christine Jacoba Aaftink Christine_Jacoba Aaftink
##  8 Christine Jacoba Aaftink Christine_Jacoba Aaftink
##  9 Christine Jacoba Aaftink Christine_Jacoba Aaftink
## 10 Christine Jacoba Aaftink Christine_Jacoba Aaftink
# Clean up event names, remove the sport prefix before Men's/Women's/Mixed
# e.g. "Athletics Men's 100 metres" -> "Men's 100 metres"
olympics %>%
 mutate(event_clean = str_replace(event, "^.*?(Men's|Women's|Mixed)", "\\1")) %>%
  select(event, event_clean) %>%
  head(10)
## # A tibble: 10 × 2
##    event                              event_clean            
##    <chr>                              <chr>                  
##  1 Basketball Men's Basketball        Men's Basketball       
##  2 Judo Men's Extra-Lightweight       Men's Extra-Lightweight
##  3 Football Men's Football            Men's Football         
##  4 Tug-Of-War Men's Tug-Of-War        Men's Tug-Of-War       
##  5 Speed Skating Women's 500 metres   Women's 500 metres     
##  6 Speed Skating Women's 1,000 metres Women's 1,000 metres   
##  7 Speed Skating Women's 500 metres   Women's 500 metres     
##  8 Speed Skating Women's 1,000 metres Women's 1,000 metres   
##  9 Speed Skating Women's 500 metres   Women's 500 metres     
## 10 Speed Skating Women's 1,000 metres Women's 1,000 metres