Import data
# Read the Excel workbook from the sibling data directory.
data <- read_excel("../00_data/NHLDATA.xlsx")

# Keep only the character columns and the first 50 rows as a small
# working sample for the string examples.
data_small <- data %>%
  select(where(is.character)) %>%
  head(n = 50)

# Preview the name and birth-country columns.
data_small %>%
  select(first_name, last_name, birth_country)
## # A tibble: 50 × 3
## first_name last_name birth_country
## <chr> <chr> <chr>
## 1 Bryan Adams CAN
## 2 Donald Audette CAN
## 3 Eric Bertrand CAN
## 4 Jason Botterill CAN
## 5 Andrew Brunette CAN
## 6 Kelly Buchberger CAN
## 7 Hnat Domenichelli CAN
## 8 Shean Donovan CAN
## 9 Nelson Emerson CAN
## 10 Ray Ferraro CAN
## # ℹ 40 more rows
# Print the whole sample (every character column kept in data_small).
data_small
## # A tibble: 50 × 5
## first_name last_name birth_city birth_country birth_state_province
## <chr> <chr> <chr> <chr> <chr>
## 1 Bryan Adams Fort St. James CAN British Columbia
## 2 Donald Audette Laval CAN Quebec
## 3 Eric Bertrand St-Ephrem CAN Quebec
## 4 Jason Botterill Edmonton CAN Alberta
## 5 Andrew Brunette Sudbury CAN Ontario
## 6 Kelly Buchberger Langenburg CAN Saskatchewan
## 7 Hnat Domenichelli Edmonton CAN Alberta
## 8 Shean Donovan Timmins CAN Ontario
## 9 Nelson Emerson Hamilton CAN Ontario
## 10 Ray Ferraro Trail CAN British Columbia
## # ℹ 40 more rows
Detect matches
# Keep only the players whose first name contains an uppercase "A"
# (the pattern is case-sensitive, so a lowercase "a" does not match).
data_small %>%
  filter(str_detect(first_name, "A"))
## # A tibble: 2 × 5
## first_name last_name birth_city birth_country birth_state_province
## <chr> <chr> <chr> <chr> <chr>
## 1 Andrew Brunette Sudbury CAN Ontario
## 2 Andreas Karlsson Ludvika SWE NA
# Count how many first names end with "n": "n$" anchors the match at the end
# of the string, and summing the logical result counts the TRUE values.
# The summary column is left unnamed, so it is labelled with the expression text.
data_small %>%
summarise(sum(str_detect(first_name, "n$")))
## # A tibble: 1 × 1
## `sum(str_detect(first_name, "n$"))`
## <int>
## 1 12
# One TRUE/FALSE per player: does the first name end with "n"?
data_small$first_name %>%
  str_detect("n$")
## [1] TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE
## [13] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [25] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE TRUE
# Number of first names ending with "n" (each TRUE counts as 1).
data_small$first_name %>%
  str_detect("n$") %>%
  sum()
## [1] 12
# Share of first names ending with "n" (the mean of a logical vector is the
# fraction of TRUE values).
data_small$first_name %>%
  str_detect("n$") %>%
  mean()
## [1] 0.24
Replacing matches
# Replace the leading uppercase letter of each first name with "-".
# "^" anchors the pattern at the start, and str_replace() changes only the
# first match, so at most one character per name is replaced.
data_small %>%
  select(first_name) %>%
  mutate(first_name_rev = str_replace(first_name, "^[A-Z]", "-"))
## # A tibble: 50 × 2
## first_name first_name_rev
## <chr> <chr>
## 1 Bryan -ryan
## 2 Donald -onald
## 3 Eric -ric
## 4 Jason -ason
## 5 Andrew -ndrew
## 6 Kelly -elly
## 7 Hnat -nat
## 8 Shean -hean
## 9 Nelson -elson
## 10 Ray -ay
## # ℹ 40 more rows
# Replace every uppercase letter (not only a leading one) with "-":
# the pattern is unanchored and str_replace_all() rewrites all matches.
data_small %>%
  select(first_name) %>%
  mutate(first_name_rev = str_replace_all(first_name, "[A-Z]", "-"))
## # A tibble: 50 × 2
## first_name first_name_rev
## <chr> <chr>
## 1 Bryan -ryan
## 2 Donald -onald
## 3 Eric -ric
## 4 Jason -ason
## 5 Andrew -ndrew
## 6 Kelly -elly
## 7 Hnat -nat
## 8 Shean -hean
## 9 Nelson -elson
## 10 Ray -ay
## # ℹ 40 more rows