Ce document présente des exercices de manipulation de chaînes (stringr) et de text mining (tidytext).
# Toy contact table shared by the string-manipulation exercises below:
# one row per person, with a full name (title included), a street address
# and a city.
data_textuelle <- tibble(
  nom = c(
    "Mr Eric Legrand",
    "Mme Anabelle Dupont",
    "Mme Fatou Slaoui"
  ),
  adresse = c(
    "12 rue Archimède",
    "47 avenue de la Libération",
    "3 rue des Fleurs"
  ),
  ville = c("Niort", "Poitiers", "Marseille")
)

# Interactive preview of the table, 5 rows per page.
data_textuelle %>%
  datatable(options = list(pageLength = 5))
# Exercise 1 - case conversion.
# transmute() keeps only the derived columns, so exo1 contains the
# upper/lower/title-case variants and none of the source columns.
exo1 <- transmute(
  data_textuelle,
  nom_upper     = str_to_upper(nom),
  nom_lower     = str_to_lower(nom),
  nom_title     = str_to_title(nom),
  adresse_upper = str_to_upper(adresse),
  ville_title   = str_to_title(ville)
)

exo1 %>%
  datatable(options = list(pageLength = 5))
# Exercise 2 - concatenation.
# Append the upper-cased city to the address, separated by a single space
# (paste()'s default separator, spelled out here).
exo2 <- mutate(
  data_textuelle,
  adresse_ville = paste(adresse, str_to_upper(ville), sep = " ")
)

exo2 %>%
  datatable(options = list(pageLength = 5))
# Exercise 3 - concatenation with a custom separator.
# Build one flat record per person, once with "-" and once with "|" as the
# field delimiter.
exo3 <- mutate(
  data_textuelle,
  full_sep  = paste(nom, adresse, ville, sep = "-"),
  full_pipe = paste(nom, adresse, ville, sep = "|")
)

exo3 %>%
  datatable(options = list(pageLength = 5))
# Split each pipe-delimited record back into its fields.
# The pattern is a regular expression, so the pipe has to be escaped;
# the result is a list with one character vector per input row.
liste_split <- str_split(exo3[["full_pipe"]], pattern = "\\|")
liste_split
## [[1]]
## [1] "Mr Eric Legrand" "12 rue Archimède" "Niort"
##
## [[2]]
## [1] "Mme Anabelle Dupont" "47 avenue de la Libération"
## [3] "Poitiers"
##
## [[3]]
## [1] "Mme Fatou Slaoui" "3 rue des Fleurs" "Marseille"
# Exercise 5 - split the full name into title / first name / family name.
# extra = "merge" keeps any additional words inside the last piece, and
# fill = "right" pads with NA on the right when there are fewer than three
# words.
exo5 <- separate(
  data_textuelle,
  col   = nom,
  into  = c("genre", "prenom", "nom_famille"),
  sep   = " ",
  extra = "merge",
  fill  = "right"
)

exo5 %>%
  datatable(options = list(pageLength = 5))
# Small three-document corpus used by the cleaning / tokenisation exercises.
# `id` identifies each document, `texte` holds its raw sentence.
data_corpus <- tibble(
  id    = 1:3,
  texte = c(
    "La data science est un domaine interdisciplinaire.",
    "Le text mining permet d’analyser des documents.",
    "Les ingénieurs utilisent R et Python pour le NLP."
  )
)

data_corpus %>%
  datatable(options = list(pageLength = 5))
# Exercise 7 - corpus cleaning.
# Adds `texte_clean`, a normalised copy of `texte`:
#   1. lower-case everything;
#   2. replace punctuation with a space. Deleting it outright would glue
#      elided words together ("d’analyser" -> "danalyser"); a space keeps
#      them apart ("d analyser");
#   3. drop digits;
#   4. collapse the repeated / leading / trailing blanks left by the
#      previous steps.
exo7 <- data_corpus %>%
  mutate(
    texte_clean = texte %>%
      str_to_lower() %>%
      str_replace_all("[[:punct:]]", " ") %>%
      str_remove_all("[[:digit:]]") %>%
      str_squish()
  )

datatable(exo7, options = list(pageLength = 5))
# Tokenisation: one row per (document id, word), taken from the cleaned text.
tokens <- unnest_tokens(
  select(exo7, id, texte_clean),
  output = word,
  input  = texte_clean
)

tokens %>%
  datatable(options = list(pageLength = 10))
# Word frequencies over the whole corpus, most frequent first
# (count() stores the tally in column `n`).
freq <- count(tokens, word, sort = TRUE)

freq %>%
  datatable(options = list(pageLength = 10))
# Ten most frequent words.
# slice_max() keeps ties by default, and in a small corpus most words share
# the same count, so it would return far more than ten rows and contradict
# the chart title. with_ties = FALSE caps the result at exactly 10 rows
# (fewer only if `freq` itself has fewer than 10 words).
top10 <- freq %>%
  slice_max(n, n = 10, with_ties = FALSE)

# Horizontal bar chart of the top words, ordered by frequency.
ggplot(top10, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "#2C7FB8") +
  coord_flip() +
  labs(
    title = "Top 10 des mots les plus fréquents",
    x = "Mot",
    y = "Fréquence"
  ) +
  theme_minimal()

# Word cloud built from the full frequency table (not only the top 10).
wordcloud2(freq, size = 0.8)
# Exercise 9 - empirical word probabilities on a single English passage.
corpus9 <- "Tom was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do : once or twice he had peeped into the book his sister was reading, but it had no pictures or conversations in it, and what is the use of a book thought Alice without pictures or conversation"

# Lower-case the passage and split it into one word per row.
tokens9 <- unnest_tokens(
  tibble(texte = str_to_lower(corpus9)),
  output = word,
  input  = texte
)

# Total number of tokens: the denominator of every probability below.
total_mots9 <- nrow(tokens9)

# Occurrences per word, most frequent first.
freq9 <- count(tokens9, word, sort = TRUE)

# Relative frequency of each word = its count / total number of tokens.
proba9 <- mutate(freq9, proba = n / total_mots9)
# Words whose probability of occurrence we want to inspect.
cibles <- c("of", "tom", "was", "to")

# Keep only the rows of the probability table that match a target word.
proba_cibles <- filter(proba9, word %in% cibles)
proba_cibles

# Bar chart: one bar per target word, height = empirical probability.
ggplot(data = proba_cibles, mapping = aes(x = word, y = proba)) +
  geom_col(fill = "#F03B20") +
  labs(
    title = "Probabilité d’apparition de mots ciblés",
    x = "Mot",
    y = "Probabilité"
  ) +
  theme_minimal()
# Positions of the addresses that contain "Libération".
id_liberation <- which(str_detect(data_textuelle[["adresse"]], "Libération"))
id_liberation
## [1] 2

# Number of matching addresses.
nb_liberation <- length(id_liberation)
nb_liberation
## [1] 1

# Extract the matched text itself; rows without a match yield NA.
exo11 <- str_extract(data_textuelle[["adresse"]], pattern = "Libération")
exo11
## [1] NA "Libération" NA
# Exercise 12 - replacement and extraction.
#   nom2     : a leading whole-word "Mr" becomes "M." (first match only);
#   adresse2 : capitalise the whole words "rue" / "avenue" (the named vector
#              maps each pattern to its replacement);
#   num_rue  : first run of digits found in the address.
exo12 <- mutate(
  data_textuelle,
  nom2 = str_replace(nom, pattern = "^Mr\\b", replacement = "M."),
  adresse2 = str_replace_all(
    adresse,
    c("\\brue\\b" = "Rue", "\\bavenue\\b" = "Avenue")
  ),
  num_rue = str_extract(adresse, pattern = "\\d+")
)

exo12 %>%
  datatable(options = list(pageLength = 5))
# Exercise 13 - padding and trimming.
# Pad each address to 30 characters with blanks on both sides, then strip
# that surrounding whitespace again with str_trim().
exo13 <- mutate(
  data_textuelle,
  adresse_pad  = str_pad(adresse, width = 30, side = "both"),
  adresse_trim = str_trim(adresse_pad, side = "both")
)

exo13 %>%
  datatable(options = list(pageLength = 5))
# Exercise 14 - detect and extract a street number.
#   has_num : TRUE when the address contains at least one digit;
#   num     : the first run of digits (NA when there is none).
exo14 <- mutate(
  data_textuelle,
  has_num = str_detect(adresse, pattern = "\\d+"),
  num     = str_extract(adresse, pattern = "\\d+")
)

exo14 %>%
  datatable(options = list(pageLength = 5))
# Exercise 15 - amounts stored as text with a trailing dollar sign.
# fixed("$") matches a literal "$"; as a regex, "$" would mean end-of-string.
#   has_dollar : does the amount contain a "$"?
#   Prix       : the amount with its first "$" removed, converted to numeric.
df15 <- tibble(
  Id      = 1:4,
  Montant = c("10$", "20$", "40$", "50$")
) %>%
  mutate(
    has_dollar = str_detect(Montant, fixed("$")),
    Prix       = as.numeric(str_remove(Montant, fixed("$")))
  )

# Average price over all rows.
moyenne_prix15 <- mean(df15[["Prix"]])

df15 %>%
  datatable(options = list(pageLength = 5))
moyenne_prix15
## [1] 30
# Section "flights + graphique": delay flags on the flights data, then a
# chart.
# The section heading had been fused into the assignment below, which made
# `flights + graphiqueexo16_case <- ...` an invalid assignment target and
# left `exo16_case` undefined for the following steps.
#
# NOTE(review): `flights` is not defined in this excerpt; presumably it is
# nycflights13::flights - confirm it is loaded before this point.
#
# Adds three logical flags derived from `dep_delay`:
#   retard_dep     : departure delay strictly positive;
#   retard_extreme : departure delay of 60 or more;
#   on_time        : departure delay exactly 0.
# A missing `dep_delay` yields NA for all three flags.
exo16_case <- flights %>%
  mutate(
    retard_dep = dep_delay > 0,
    retard_extreme = dep_delay >= 60,
    on_time = dep_delay == 0
  )

# Quick look at the first 10 rows of the delay and its flags.
exo16_case %>%
  select(dep_delay, retard_dep, retard_extreme, on_time) %>%
  head(10)
# Share of flights in each delay category.
# The mean of a logical flag is the proportion of TRUE values; rows where
# the flag is NA are left out (na.rm = TRUE). The one-row summary is then
# reshaped to long format (one row per category) and the technical column
# names are replaced by display labels for the chart.
resume_retards <- summarise(
  exo16_case,
  vols_total  = n(),
  pct_retard  = mean(retard_dep, na.rm = TRUE),
  pct_extreme = mean(retard_extreme, na.rm = TRUE),
  pct_on_time = mean(on_time, na.rm = TRUE)
) %>%
  pivot_longer(
    cols      = starts_with("pct_"),
    names_to  = "type",
    values_to = "valeur"
  ) %>%
  mutate(
    type = recode(
      type,
      pct_retard  = "Vols en retard",
      pct_extreme = "Retards extrêmes (>=60min)",
      pct_on_time = "À l'heure"
    )
  )
resume_retards

# Bar chart of the three proportions, y axis formatted as whole percentages.
ggplot(data = resume_retards, mapping = aes(x = type, y = valeur)) +
  geom_col(fill = "#31A354") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Proportion de vols à l’heure / en retard",
    x = "",
    y = "Proportion"
  ) +
  theme_minimal()
Ce TP illustre :

- la manipulation de chaînes (casse, concaténation, séparation, extraction, remplacement) ;
- le nettoyage d’un corpus ;
- la tokenisation et l’analyse fréquentielle ;
- la présentation de résultats avec des tableaux et des graphiques dans un rapport reproductible.