# excel file
data <- read_excel("../00_data/my data q&a.xlsx")
data
## # A tibble: 269,732 × 15
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 1 A Dijia… M 24 180 80 China CHN 1992… 1992 Summer Barc…
## 2 2 A Lamusi M 23 170 60 China CHN 2012… 2012 Summer Lond…
## 3 3 Gunnar … M 24 NA NA Denm… DEN 1920… 1920 Summer Antw…
## 4 4 Edgar L… M 34 NA NA Denm… DEN 1900… 1900 Summer Paris
## 5 5 Christi… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
## 6 5 Christi… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
## 7 5 Christi… F 25 185 82 Neth… NED 1992… 1992 Winter Albe…
## 8 5 Christi… F 25 185 82 Neth… NED 1992… 1992 Winter Albe…
## 9 5 Christi… F 27 185 82 Neth… NED 1994… 1994 Winter Lill…
## 10 5 Christi… F 27 185 82 Neth… NED 1994… 1994 Winter Lill…
## # ℹ 269,722 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>
df <- data.frame(
age = c(25, 25, 25, 33, 15),
height = c(NA, 196, 190, 173, 149),
weight = c(NA, 86, 90, 73, 45)
)
df %>%
summarise(count = sum(str_detect(as.character(age), "5$")))
## count
## 1 4
# Check which ages end with "5"
df %>% summarise(sum(str_detect(as.character(age), "5$")))
## sum(str_detect(as.character(age), "5$"))
## 1 4
# Or individually:
str_detect(as.character(df$age), "5$") # TRUE/FALSE vector
## [1] TRUE TRUE TRUE FALSE TRUE
sum(str_detect(as.character(df$age), "5$")) # Count how many
## [1] 4
mean(str_detect(as.character(df$age), "5$")) # Proportion of total
## [1] 0.8
# Add a text column for demo
df <- data.frame(
id = 1:6,
comment = c(
"I like the red bike.",
"She wore a green dress.",
"No colors mentioned here.",
"The sky is blue and beautiful.",
"Purple is her favorite color.",
"He brought an orange umbrella."
)
)
# Color pattern
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
# Extract rows with color words
has_colour <- df %>%
filter(str_detect(comment, colour_match))
# Extract the matched color
has_colour %>%
mutate(matched_colour = str_extract(comment, colour_match))
## id comment matched_colour
## 1 1 I like the red bike. red
## 2 2 She wore a green dress. green
## 3 4 The sky is blue and beautiful. blue
## 4 6 He brought an orange umbrella. orange
# Sample data with a string column
df <- data.frame(
id = 1:5,
code = c("A123", "B456", "C789", "Z000", "X999")
)
# Replace the first capital letter with "-"
df %>%
mutate(code_replace_first = str_replace(code, "[A-Z]", "-"))
## id code code_replace_first
## 1 1 A123 -123
## 2 2 B456 -456
## 3 3 C789 -789
## 4 4 Z000 -000
## 5 5 X999 -999
# Replace all capital letters with "-"
df %>%
mutate(code_replace_all = str_replace_all(code, "[A-Z]", "-"))
## id code code_replace_all
## 1 1 A123 -123
## 2 2 B456 -456
## 3 3 C789 -789
## 4 4 Z000 -000
## 5 5 X999 -999