Module 10: Apply it to your data 9

Import your data

# excel file
data <- read_excel("../00_data/my data q&a.xlsx")
data

## # A tibble: 269,732 × 15
##       id name     sex   age   height weight team  noc   games  year season city 
##    <dbl> <chr>    <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
##  1     1 A Dijia… M     24    180    80     China CHN   1992…  1992 Summer Barc…
##  2     2 A Lamusi M     23    170    60     China CHN   2012…  2012 Summer Lond…
##  3     3 Gunnar … M     24    NA     NA     Denm… DEN   1920…  1920 Summer Antw…
##  4     4 Edgar L… M     34    NA     NA     Denm… DEN   1900…  1900 Summer Paris
##  5     5 Christi… F     21    185    82     Neth… NED   1988…  1988 Winter Calg…
##  6     5 Christi… F     21    185    82     Neth… NED   1988…  1988 Winter Calg…
##  7     5 Christi… F     25    185    82     Neth… NED   1992…  1992 Winter Albe…
##  8     5 Christi… F     25    185    82     Neth… NED   1992…  1992 Winter Albe…
##  9     5 Christi… F     27    185    82     Neth… NED   1994…  1994 Winter Lill…
## 10     5 Christi… F     27    185    82     Neth… NED   1994…  1994 Winter Lill…
## # ℹ 269,722 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>

Tools

df <- data.frame(
  age = c(25, 25, 25, 33, 15),
  height = c(NA, 196, 190, 173, 149),
  weight = c(NA, 86, 90, 73, 45)
)

df %>%
  summarise(count = sum(str_detect(as.character(age), "5$")))

##   count
## 1     4

Detect matches

# Check which ages end with "5"
df %>% summarise(sum(str_detect(as.character(age), "5$")))

##   sum(str_detect(as.character(age), "5$"))
## 1                                        4

# Or individually:
str_detect(as.character(df$age), "5$")           # TRUE/FALSE vector

## [1]  TRUE  TRUE  TRUE FALSE  TRUE

sum(str_detect(as.character(df$age), "5$"))      # Count how many

## [1] 4

mean(str_detect(as.character(df$age), "5$"))     # Proportion of total

## [1] 0.8

Extract matches

# Add a text column for demo
df <- data.frame(
  id = 1:6,
  comment = c(
    "I like the red bike.",
    "She wore a green dress.",
    "No colors mentioned here.",
    "The sky is blue and beautiful.",
    "Purple is her favorite color.",
    "He brought an orange umbrella."
  )
)

# Color pattern
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")

# Extract rows with color words
has_colour <- df %>%
  filter(str_detect(comment, colour_match))

# Extract the matched color
has_colour %>%
  mutate(matched_colour = str_extract(comment, colour_match))

##   id                        comment matched_colour
## 1  1           I like the red bike.            red
## 2  2        She wore a green dress.          green
## 3  4 The sky is blue and beautiful.           blue
## 4  6 He brought an orange umbrella.         orange

Replacing matches

# Sample data with a string column
df <- data.frame(
  id = 1:5,
  code = c("A123", "B456", "C789", "Z000", "X999")
)

# Replace the first capital letter with "-"
df %>% 
  mutate(code_replace_first = str_replace(code, "[A-Z]", "-"))

##   id code code_replace_first
## 1  1 A123               -123
## 2  2 B456               -456
## 3  3 C789               -789
## 4  4 Z000               -000
## 5  5 X999               -999

# Replace all capital letters with "-"
df %>% 
  mutate(code_replace_all = str_replace_all(code, "[A-Z]", "-"))

##   id code code_replace_all
## 1  1 A123             -123
## 2  2 B456             -456
## 3  3 C789             -789
## 4  4 Z000             -000
## 5  5 X999             -999

Module 10: Apply it to your data 9

Declan Fogarty

Import your data

Chapter 14

Tools

Detect matches

Extract matches

Replacing matches