Este cuaderno visualiza los patrones que sigue la palabra del día de tildes
if(!"tidyverse" %in% installed.packages()) {install.packages("tidyverse")}
if(!"koRpus" %in% installed.packages()) {install.packages("koRpus")}
if (!"koRpus.lang.es" %in% installed.packages()) {install.packages("koRpus.lang.es", repos="https://undocumeantit.github.io/repos/l10n/")}
library("tidyverse")
library("koRpus")
library("koRpus.lang.es")
library("sylly.es")
library(ggtext) # Dar color a los textos de las leyendas
# Describe a Spanish readr locale (comma decimals, dot thousands separator,
# Berlin time zone, UTF-8 input). The object is not assigned to a name, so at
# top level this call only prints the locale description.
locale(
  date_names = "es",
  date_format = "%AD",
  time_format = "%AT",
  decimal_mark = ",",
  grouping_mark = ".",
  tz = "Europe/Berlin",
  encoding = "UTF-8",
  asciify = FALSE
)
## <locale>
## Numbers: 123.456,78
## Formats: %AD / %AT
## Timezone: Europe/Berlin
## Encoding: UTF-8
## <date_names>
## Days: domingo (dom.), lunes (lun.), martes (mar.), miércoles (mié.), jueves
## (jue.), viernes (vie.), sábado (sáb.)
## Months: enero (ene.), febrero (feb.), marzo (mar.), abril (abr.), mayo (may.),
## junio (jun.), julio (jul.), agosto (ago.), septiembre (sept.),
## octubre (oct.), noviembre (nov.), diciembre (dic.)
## AM/PM: a. m./p. m.
# Switch every locale category of the R session to Spanish.
# NOTE(review): the name "spanish" looks Windows-specific (the echoed result
# below is Spanish_Spain.1252) -- confirm behaviour on other platforms.
Sys.setlocale("LC_ALL", locale = "spanish")
## [1] "LC_COLLATE=Spanish_Spain.1252;LC_CTYPE=Spanish_Spain.1252;LC_MONETARY=Spanish_Spain.1252;LC_NUMERIC=C;LC_TIME=Spanish_Spain.1252"
# Shared ggplot2 theme for every figure in this notebook.
#
# Starts from theme_bw() and removes the grid lines, centres the title and
# subtitle, puts the legend on top and draws every text element in one colour.
# All font sizes are expressed relative to `base_size`.
#
# Arguments:
#   base_size   Reference font size (points) from which the others are derived.
#   base_color  Colour used for the panel border and for all text.
#   base_family Font family passed on to theme_bw().
#
# Returns a ggplot2 theme object that can be added to a plot with `+`.
my_theme <- function(
    base_size = 11,
    base_color = "#5a5856",
    base_family = "sans") {
  theme_bw(base_size = base_size, base_family = base_family) +
    theme(
      panel.border = element_rect(colour = base_color),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_text(
        size = base_size + 2,
        face = "bold",
        color = base_color,
        vjust = 1.25,
        hjust = 0.5
      ),
      plot.subtitle = element_text(
        size = base_size + 2,
        color = base_color,
        hjust = 0.5
      ),
      plot.caption = element_text(
        hjust = 1,
        size = base_size - 1,
        color = base_color
      ),
      legend.position = "top",
      legend.margin = margin(0, 0, 0, 0, unit = "pt"),
      legend.text = element_text(size = base_size + 1),
      text = element_text(size = base_size, color = base_color),
      # Text margins are built with margin() (top, right, bottom, left), the
      # constructor ggplot2 documents for element_text(), instead of a bare
      # unit() vector.
      axis.title.x = element_text(
        size = base_size + 1,
        vjust = 0,
        color = base_color,
        margin = margin(3, 0, 0, 0, unit = "mm")
      ),
      axis.title.y = element_text(
        size = base_size + 1,
        vjust = 1.25,
        color = base_color,
        margin = margin(0, 3, 0, 0, unit = "mm")
      ),
      axis.title.y.right = element_text(
        size = base_size + 1,
        color = base_color,
        margin = margin(0, 0, 0, 3, unit = "mm")
      ),
      axis.text.x = element_text(size = base_size - 1, color = base_color),
      axis.text.y = element_text(size = base_size - 1, color = base_color),
      strip.text = element_text(size = base_size + 2, color = base_color),
      strip.text.x = element_text(size = base_size + 1, color = base_color),
      strip.text.y = element_text(size = base_size + 1, color = base_color),
      strip.background = element_rect(color = NA, fill = NA)
    )
}
# Download the word list and derive, for every word: its syllable split, the
# accented vowel, the stress class and a few counters used by the plots.
# NOTE(review): assumes the downloaded file exposes a `palabras_con_tilde`
# column with one word per row -- confirm against the data source.
url_palabras_tilde <- "https://raw.githubusercontent.com/congosto/congosto.github.io/master/datos/tildes.txt"
palabras_tilde <- read_csv(url_palabras_tilde) %>%
  # Work in lower case so the vowel classes below need no upper-case variants.
  mutate(palabras_con_tilde = tolower(palabras_con_tilde)) %>%
  # Syllable split produced by the hyphenation patterns ("-" between syllables).
  mutate(
    silabas = hyphen(
      palabras_con_tilde,
      hyph.pattern = "es",
      quiet = TRUE
    )@hyphen$word
  ) %>%
  # The next four steps patch splits that, per the original author's notes,
  # the library gets wrong. The character classes list only the vowels: inside
  # [...] a space or "|" is a literal member, not a separator.
  # Simple hiatus: two adjacent strong vowels.
  mutate(silabas = gsub("([aeo])([aeo])", "\\1-\\2", silabas)) %>%
  # Accentual hiatus: accented weak vowel next to a strong vowel.
  mutate(silabas = gsub("([íú])([aeo])", "\\1-\\2", silabas)) %>%
  mutate(silabas = gsub("([aeo])([íú])", "\\1-\\2", silabas)) %>%
  # Vowel-consonant-vowel run left unsplit (intended for vowel-initial words).
  mutate(
    silabas = gsub(
      "([aeiouáéíóú])([bcdfghjklmnpqrstvwxyz])([aeiouáéíóú])",
      "\\1-\\2\\3",
      silabas
    )
  ) %>%
  mutate(
    # Syllables are separated by "-", so separators + 1 = syllable count.
    num_silabas = str_count(silabas, "-") + 1,
    num_letras = nchar(palabras_con_tilde),
    # First accented vowel of the word.
    tilde = str_extract(silabas, "[áéíóú]"),
    # Everything from the accented vowel to the end of the word; counting its
    # syllables gives the stressed syllable's position from the end (1 = last).
    silaba_tilde = str_extract(silabas, "[áéíóú].*"),
    num_silabas_post_tilde = str_count(silaba_tilde, "-") + 1,
    # Why the word carries a written accent; anything unmatched is "Otros".
    tipo_acentuacion = case_when(
      grepl("[íú][aeo]|[aeo][íú]", palabras_con_tilde) ~ "Hiato acentual",
      grepl("ón$", palabras_con_tilde) ~ "Terminación ón",
      TRUE ~ "Otros"
    ),
    # Stress class from the position of the stressed syllable.
    acentuacion = case_when(
      num_silabas_post_tilde == 1 ~ "Aguda",
      num_silabas_post_tilde == 2 ~ "Llana",
      num_silabas_post_tilde >= 3 ~ "Esdrújula"
    )
  ) %>%
  # Position of the word in the list (1 = first row), stored as a double in the
  # first column.
  rowid_to_column(var = "orden") %>%
  mutate(orden = as.numeric(orden)) %>%
  # Number of words sharing the same length; denominator of the percentages.
  add_count(num_letras, name = "total_num_letras")
# Display order of the stress classes and of the accent types. Both vectors are
# kept as standalone objects because later plots reuse them for legend labels.
orden_acentuacion <- c("Esdrújula", "Llana", "Aguda")
orden_tipo_acentuacion <- c("Hiato acentual", "Terminación ón", "Otros")
# Turn both columns into factors so plots and tables follow that order.
palabras_tilde <- palabras_tilde %>%
  mutate(
    acentuacion = factor(acentuacion, levels = orden_acentuacion),
    tipo_acentuacion = factor(tipo_acentuacion, levels = orden_tipo_acentuacion)
  )
# Share (in %) of each accented vowel among the words of a given length.
# Result columns: tilde, num_letras, percent.
tilde <- palabras_tilde %>%
  group_by(tilde, num_letras) %>%
  summarise(
    # total_num_letras is constant within a word length, so its first value is
    # enough; this keeps summarise() at exactly one row per group.
    percent = round(n() / first(total_num_letras) * 100, 1),
    .groups = "drop"
  ) %>%
  # Add the vowel/length combinations that never occur, with a 0% share.
  complete(tilde, num_letras) %>%
  mutate(percent = replace_na(percent, 0))
# Heatmap: accented vowel (x) against word length (y), filled and labelled with
# the percentage of words of that length.
ggplot(data = tilde, aes(x = tilde, y = factor(num_letras))) +
  # factor() keeps the lengths in numeric order; as.character() would sort
  # them as text and place "10" before "4".
  geom_tile(aes(fill = percent), color = "white") +
  geom_label(
    aes(label = paste0(percent, "%")),
    color = "#5a5856",
    label.size = 0
  ) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_gradient(
    na.value = "grey90",
    low = "#DDEAFA",
    high = "#036DFA",
    guide = guide_legend()
  ) +
  labs(
    title = "Tildes vs. Número de letras",
    x = "",
    y = "Número de letras",
    # The percentage is mapped to `fill`, so that is the legend being titled.
    fill = "% Casos"
  ) +
  coord_fixed() +
  my_theme() +
  theme(
    # "none" is the documented value that hides the legend.
    legend.position = "none",
    panel.background = element_rect(fill = "grey90", color = "grey50")
  )
# Share (in %) of each stress class among the words of a given length.
# Result columns: acentuacion, num_letras, percent.
acentuacion <- palabras_tilde %>%
  group_by(acentuacion, num_letras) %>%
  summarise(
    # total_num_letras is constant within a word length, so its first value is
    # enough; this keeps summarise() at exactly one row per group.
    percent = round(n() / first(total_num_letras) * 100, 1),
    .groups = "drop"
  ) %>%
  # Add the class/length combinations that never occur, with a 0% share.
  complete(acentuacion, num_letras) %>%
  mutate(percent = replace_na(percent, 0))
# Heatmap: stress class (x) against word length (y), filled and labelled with
# the percentage of words of that length.
ggplot(data = acentuacion, aes(x = acentuacion, y = factor(num_letras))) +
  # factor() keeps the lengths in numeric order; as.character() would sort
  # them as text and place "10" before "4".
  geom_tile(aes(fill = percent), color = "white") +
  geom_label(
    aes(label = paste0(percent, "%")),
    color = "#5a5856",
    label.size = 0
  ) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_gradient(
    low = "#DDEAFA",
    high = "#036DFA",
    guide = guide_legend()
  ) +
  labs(
    title = "Acentuación vs. Número de letras",
    x = "",
    y = "Número de letras",
    # The percentage is mapped to `fill`, so that is the legend being titled.
    fill = "% Casos"
  ) +
  coord_fixed() +
  my_theme() +
  # "none" is the documented value that hides the legend.
  theme(legend.position = "none")
# Share (in %) of each accent type (accentual hiatus, "ón" ending, other)
# among the words of a given length.
# Result columns: tipo_acentuacion, num_letras, percent.
hiato_on <- palabras_tilde %>%
  group_by(tipo_acentuacion, num_letras) %>%
  summarise(
    # total_num_letras is constant within a word length, so its first value is
    # enough; this keeps summarise() at exactly one row per group.
    percent = round(n() / first(total_num_letras) * 100, 1),
    .groups = "drop"
  ) %>%
  # Add the type/length combinations that never occur, with a 0% share.
  complete(tipo_acentuacion, num_letras) %>%
  mutate(percent = replace_na(percent, 0))
# Heatmap: accent type (x) against word length (y), filled and labelled with
# the percentage of words of that length.
ggplot(data = hiato_on, aes(x = tipo_acentuacion, y = factor(num_letras))) +
  # factor() keeps the lengths in numeric order; as.character() would sort
  # them as text and place "10" before "4".
  geom_tile(aes(fill = percent), color = "white") +
  geom_label(
    aes(label = paste0(percent, "%")),
    color = "#5a5856",
    label.size = 0
  ) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_gradient(
    low = "#DDEAFA",
    high = "#036DFA",
    guide = guide_legend()
  ) +
  labs(
    title = 'Hiato acentual y terminación "ón" vs. número de letras',
    x = "",
    y = "Número de letras",
    # The percentage is mapped to `fill`, so that is the legend being titled.
    fill = "% Casos"
  ) +
  coord_fixed() +
  my_theme() +
  # "none" is the documented value that hides the legend.
  theme(legend.position = "none")
# Colour assigned to each accented vowel, and the vowels in display order.
color_tildes <- c("á" = "blue", "é" = "darkgreen", "í" = "darkred", "ó" = "orange", "ú" = "purple")
orden_tildes <- c("á", "é", "í", "ó", "ú")
# One row per (word position, word length) cell; cells that do not correspond
# to a word get tilde = NA.
df <- palabras_tilde %>%
  distinct(orden, num_letras, tilde) %>%
  complete(orden, num_letras)
# Grid of word position (x) by word length (y) with the accented vowel printed
# in the cell of each word.
# Both axes go through factor() so the levels follow numeric order and both
# layers share the same x positions; as.character() sorts as text ("10" < "2").
ggplot(data = df, aes(x = factor(orden), y = factor(num_letras))) +
  geom_tile(
    show.legend = FALSE,
    fill = "white",
    color = "grey"
  ) +
  geom_text(
    aes(label = tilde, color = tilde),
    show.legend = FALSE
  ) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  scale_y_discrete(expand = c(0, 0)) +
  scale_color_manual(
    values = color_tildes,
    na.value = "white"
  ) +
  labs(
    title = "Patrón de tildes",
    x = "",
    y = "N. de letras"
  ) +
  coord_fixed(ratio = 2) +
  my_theme() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    legend.text = element_markdown(size = 9),
    legend.key.width = unit(0.2, "cm"),
    legend.key.height = unit(0.2, "cm")
  )
# Fill colour of each stress class.
color_acentos <- c("Esdrújula" = "blue", "Llana" = "darkgreen", "Aguda" = "darkred")
# One row per (word position, word length) cell; cells that do not correspond
# to a word get acentuacion = NA and are drawn in white.
df <- palabras_tilde %>%
  distinct(orden, num_letras, acentuacion) %>%
  complete(orden, num_letras)
# Grid of word position (x) by word length (y), coloured by stress class.
# Both axes go through factor() so words appear in list order and lengths in
# numeric order; as.character() sorts as text ("1", "10", "100", ..., "2").
ggplot(data = df, aes(x = factor(orden), y = factor(num_letras))) +
  geom_tile(
    aes(fill = acentuacion),
    color = "grey",
    alpha = 0.5
  ) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_manual(
    values = color_acentos,
    # Legend labels rendered in the colour of their class (needs
    # element_markdown() on legend.text, set below).
    labels = paste(
      "<span style='color:",
      color_acentos,
      "'>",
      orden_acentuacion,
      "</span>"
    ),
    na.value = "white",
    drop = FALSE
  ) +
  labs(
    title = "Patrón de acentuación",
    x = "",
    y = "N. de letras",
    fill = "Acentuación"
  ) +
  coord_fixed(ratio = 2) +
  my_theme() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    legend.text = element_markdown(size = 9),
    legend.key.width = unit(0.2, "cm"),
    legend.key.height = unit(0.2, "cm")
  )
# Fill colour of each accent type.
color_tipo_acentos <- c("Hiato acentual" = "blue", "Terminación ón" = "darkgreen", "Otros" = "grey50")
# One row per (word position, word length) cell; cells that do not correspond
# to a word get tipo_acentuacion = NA and are drawn in white.
df <- palabras_tilde %>%
  distinct(orden, num_letras, tipo_acentuacion) %>%
  complete(orden, num_letras)
# Grid of word position (x) by word length (y), coloured by accent type.
# Both axes go through factor() so words appear in list order and lengths in
# numeric order; as.character() sorts as text ("1", "10", "100", ..., "2").
ggplot(data = df, aes(x = factor(orden), y = factor(num_letras))) +
  geom_tile(
    aes(fill = tipo_acentuacion),
    color = "grey",
    alpha = 0.7
  ) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_manual(
    values = color_tipo_acentos,
    # Legend labels rendered in the colour of their type (needs
    # element_markdown() on legend.text, set below).
    labels = paste(
      "<span style='color:",
      color_tipo_acentos,
      "'>",
      orden_tipo_acentuacion,
      "</span>"
    ),
    na.value = "white",
    drop = FALSE
  ) +
  labs(
    title = 'Patrón hiato acentual y terminación "ón"',
    x = "",
    y = "N. de letras",
    fill = "Tipo de Acentuación"
  ) +
  coord_fixed(ratio = 2) +
  my_theme() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    legend.text = element_markdown(size = 9),
    legend.key.width = unit(0.2, "cm"),
    legend.key.height = unit(0.2, "cm")
  )