# Attach pacman so that p_load() is available.
library(pacman)
# Load the packages used by this analysis through pacman's p_load().
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr", "xfun")
# Embed the YouTube video at this URL (embed_url() presumably comes from
# vembedr -- confirm).
embed_url("https://www.youtube.com/watch?v=uPJ1R0OibAo&ab_channel=fAIrLACJalisco")
IA contra la violencia de género
# Map each frequency in `value` to the label of its bucket.
#
# Buckets are closed on the right: a value gets the label of the smallest
# upper bound it does not exceed (5, 10, 20, 50, 100, 500, 1000); anything
# above 1000 is labelled ">1,000". Works element-wise on vectors.
#
# value: numeric vector of frequencies.
# Returns a character vector of bucket labels, one per element of `value`.
FreqCategory <- function(value) {
  # Upper bounds from widest to narrowest, paired with their labels.
  vcnLimits <- c(1000, 500, 100, 50, 20, 10, 5)
  vcsLabels <- c(" 1,000", " 500", " 100", " 50", " 20", " 10", " 5")
  # Start from the overflow label and let each narrower bucket override it,
  # so the narrowest matching bucket wins.
  strCategory <- ">1,000"
  for (intIdx in seq_along(vcnLimits)) {
    strCategory <- ifelse(value <= vcnLimits[intIdx], vcsLabels[intIdx], strCategory)
  }
  strCategory
}
# Work from the folder that holds the transcript.
# NOTE(review): setwd() ties the script to one machine; it is kept so that any
# other relative paths keep resolving -- consider a project-relative path.
setwd("~/ea9am")
# Read the transcript, one element per line. The garbled accents in the
# recorded output (e.g. "sesiã³n" for "sesión") indicate a UTF-8 file that was
# decoded one byte per character; declaring the encoding keeps accented
# letters intact through the later lower-casing and stop-word matching.
# NOTE: console output recorded further down in this file was captured before
# this fix and still shows the garbled accents.
vgenero <- readLines("violenciadegenero.txt", encoding = "UTF-8")
# Preview the first lines.
head(vgenero)
## [1] "muy buenas tardes a todos y todos mi"
## [2] ""
## [3] "nombre es javier a velarde y soy parte"
## [4] ""
## [5] "del equipo de fer la cal isco trabajando"
## [6] ""
# Number of lines read from the transcript
intLineCount <- length(vgenero)
intLineCount
## [1] 2094
# Split every line on single spaces: one character vector of tokens per line.
# NOTE(review): a blank line yields one empty token (""), so the per-line mean
# and the total below include those empty strings; they are only dropped
# later, in the cleaning step.
lstUNPrfLines <- str_split(vgenero, " ")
# Tokens per line. lengths() always returns an integer vector, even for an
# empty list, whereas unlist(lapply(x, length)) collapses to NULL.
vciUNPrfWperL <- lengths(lstUNPrfLines)
# Print the mean number of tokens per line
mean(vciUNPrfWperL)
## [1] 3.589303
# Flatten the list into a single vector of tokens
vcsUNPrfWords <- unlist(lstUNPrfLines)
# Total token count = length of the vector
intWordCount <- length(vcsUNPrfWords)
# Print
intWordCount
## [1] 7516
head(vcsUNPrfWords, 100)
## [1] "muy" "buenas" "tardes" "a"
## [5] "todos" "y" "todos" "mi"
## [9] "" "nombre" "es" "javier"
## [13] "a" "velarde" "y" "soy"
## [17] "parte" "" "del" "equipo"
## [21] "de" "fer" "la" "cal"
## [25] "isco" "trabajando" "" "desde"
## [29] "el" "componente" "de" "emprendimiento"
## [33] "" "muchas" "gracias" "por"
## [37] "acompañarnos" "por" "la" ""
## [41] "espera" "a" "esta" "charla"
## [45] "sobre" "inteligencia" "" "artificial"
## [49] "" "les" "comparto" "que"
## [53] "al" "finalizar" "la" "sesión"
## [57] "" "tendremos" "unos" "cuantos"
## [61] "minutos" "para" "" "resolver"
## [65] "algunas" "preguntas" "y" "respuestas"
## [69] "" "y" "les" "pedimos"
## [73] "por" "favor" "que" "estas"
## [77] "" "preguntas" "las" "hagan"
## [81] "en" "la" "parte" "de"
## [85] "kiwi" "" "que" "está"
## [89] "en" "la" "sección" "de"
## [93] "comentarios" "" "aquÃ" "mismo"
## [97] "en" "el" "zumo" ""
# Convert every token to lower case
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# Remove digits
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:digit:]]", "")
# Remove punctuation
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:punct:]]", "")
# Remove whitespace
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:space:]]", "")
# Remove special characters. The hyphen is escaped so that it is a literal
# "-": unescaped, "&-_" is a range from "&" to "_" that also covers every
# digit and upper-case ASCII letter. "+" and "^" are listed explicitly because
# that accidental range used to strip them, so the result here is unchanged.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[~@#$%&\\-_=<>+^]", "")
# Remove any literal "$". As a regular expression "$" only matches the end of
# the string, so the previous replacement did nothing; fixed() matches the
# character itself.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = fixed("$"), "")
# Drop tokens left empty by the cleaning above (done last so that no
# replacement can leave an empty token behind)
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# Preview the cleaned tokens
head(vcsUNPrfWords, 100)
## [1] "muy" "buenas" "tardes" "a"
## [5] "todos" "y" "todos" "mi"
## [9] "nombre" "es" "javier" "a"
## [13] "velarde" "y" "soy" "parte"
## [17] "del" "equipo" "de" "fer"
## [21] "la" "cal" "isco" "trabajando"
## [25] "desde" "el" "componente" "de"
## [29] "emprendimiento" "muchas" "gracias" "por"
## [33] "acompaã±arnos" "por" "la" "espera"
## [37] "a" "esta" "charla" "sobre"
## [41] "inteligencia" "artificial" "les" "comparto"
## [45] "que" "al" "finalizar" "la"
## [49] "sesiã³n" "tendremos" "unos" "cuantos"
## [53] "minutos" "para" "resolver" "algunas"
## [57] "preguntas" "y" "respuestas" "y"
## [61] "les" "pedimos" "por" "favor"
## [65] "que" "estas" "preguntas" "las"
## [69] "hagan" "en" "la" "parte"
## [73] "de" "kiwi" "que" "estã"
## [77] "en" "la" "secciã³n" "de"
## [81] "comentarios" "aquã" "mismo" "en"
## [85] "el" "zumo" "antes" "de"
## [89] "iniciar" "quisiã©ramos" "mostrarles" "un"
## [93] "vãdeo" "sobre" "la" "iniciativa"
## [97] "verla" "kalish" "co" "el"
# Build a one-column data frame from the cleaned tokens; the column is named
# "Words" and holds character data (never factors).
dfrUNPrfWords <- data.frame(Words = vcsUNPrfWords, stringsAsFactors = FALSE)
# Preview the first ten rows
head(dfrUNPrfWords, 10)
## Words
## 1 muy
## 2 buenas
## 3 tardes
## 4 a
## 5 todos
## 6 y
## 7 todos
## 8 mi
## 9 nombre
## 10 es
# Frequency table: one row per distinct word with its count in Freq,
# sorted from most to least frequent.
dfrUNPrfFreq <- arrange(
  summarise(group_by(dfrUNPrfWords, Words), Freq = n()),
  desc(Freq)
)
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 que 309
## 2 de 299
## 3 y 273
## 4 en 143
## 5 la 135
## 6 a 130
# Word cloud of the (up to) 100 most frequent words. head() is used instead of
# [1:100] so that a table with fewer than 100 rows does not pass NA entries
# to wordcloud().
wordcloud(head(dfrUNPrfFreq$Words, 100), head(dfrUNPrfFreq$Freq, 100),
          random.order = FALSE, max.words = 100,
          colors = brewer.pal(8, "Dark2"))
# Keep significant words only
# Drop words of two characters or fewer
dfrUNPrfWords <- filter(dfrUNPrfWords, str_length(Words) > 2)
# Stop words to remove
vcsCmnWords <- c("de","que","en","y","la","a","el","es","una","un","pues",
                 "no","para","los","se","las","como","con","más","por","lo",
                 "hay","del","o","entonces","este","está","nos","pero",
                 "también","creo","porque","también","yo","ya","esta","si",
                 "me","al","son","tiene","donde","bueno","ha","sobre",
                 "ejemplo","bien","gracias","ser","eso","todo","uso","ver",
                 "tener","esto","estos","muchas","cómo","cuando","sea",
                 "tenemos","su","tienen","así","desde","han","parte","ahí",
                 "les","tal","qué","estar","buenas","ustedes","entonces",
                 "también","porque", "pues", "como")
# Words that are not meaningful in this context
vcsBadWords <- c("decir","muy","están")
# Remove both lists. Previously only vcsBadWords was applied and vcsCmnWords
# was never used, so stop words such as "que" still dominated the counts.
# NOTE: console output recorded below in this file was captured before this
# fix and still contains stop words; re-run the script to refresh it.
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% c(vcsCmnWords, vcsBadWords)))
# Preview the remaining words
head(dfrUNPrfWords)
## Words
## 1 buenas
## 2 tardes
## 3 todos
## 4 todos
## 5 nombre
## 6 javier
# Recompute the frequency table on the filtered words: one row per distinct
# word with its count in Freq, most frequent first.
dfrUNPrfFreq <- arrange(
  summarise(group_by(dfrUNPrfWords, Words), Freq = n()),
  desc(Freq)
)
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 que 309
## 2 como 67
## 3 los 67
## 4 para 66
## 5 pues 62
## 6 con 60
# Least frequent words (end of the table)
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 votaciã³n 1
## 2 wise 1
## 3 youtube 1
## 4 zacatones 1
## 5 zorra 1
## 6 zumo 1
# Keep only the words that occur more than 5 times
dfrUNPrfFreq <- dfrUNPrfFreq %>%
  filter(Freq > 5)
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 punto 6
## 2 sentimientos 6
## 3 tesis 6
## 4 todas 6
## 5 veces 6
## 6 ver 6
# Number of distinct words that remain (one row per word)
intWordCountFinal <- nrow(dfrUNPrfFreq)
# Print
intWordCountFinal
## [1] 129
# Add the frequency-category column; Freq is referenced as a column of the
# data being mutated rather than through dfrUNPrfFreq$Freq.
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat = FreqCategory(Freq))
# Number of words per frequency category. Categories are ordered by the
# smallest frequency they contain, so the order is numeric and does not depend
# on how the text labels happen to sort (" 100" sorts before " 20" as text).
dfrUNPrfFocf <- dfrUNPrfFreq %>%
  group_by(Fcat) %>%
  summarise(Rfrq = n(), MinFreq = min(Freq)) %>%
  arrange(MinFreq) %>%
  select(-MinFreq)
# Freeze that order as the levels of an ordered factor so plots keep it
dfrUNPrfFocf$Fcat <- factor(dfrUNPrfFocf$Fcat, levels = dfrUNPrfFocf$Fcat, ordered = TRUE)
# Preview
head(dfrUNPrfFocf, 10)
## # A tibble: 5 x 2
## Fcat Rfrq
## <ord> <int>
## 1 " 10" 67
## 2 " 20" 33
## 3 " 50" 20
## 4 " 100" 8
## 5 " 500" 1
# Word cloud of the (up to) 50 most frequent remaining words. head() avoids
# the NA entries that [1:50] produces when fewer than 50 rows are left.
wordcloud(head(dfrUNPrfFreq$Words, 50), head(dfrUNPrfFreq$Freq, 50),
          random.order = FALSE, max.words = 100,
          colors = brewer.pal(8, "Dark2"))
# Horizontal bar chart of the (up to) 30 most frequent words. The number of
# fill colours follows the number of rows actually plotted, because a fixed
# rainbow(30) fails when fewer than 30 words remain.
dfrUNPrfTop <- head(dfrUNPrfFreq, 30)
ggplot(dfrUNPrfTop, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = rainbow(nrow(dfrUNPrfTop))) +
  ylab("Frecuencia") +
  xlab("Palabras") +
  ggtitle("Primeras 30 palabras con mayor frecuencia") +
  theme(plot.title = element_text(size = rel(1.5), colour = "blue")) +
  coord_flip()
# Bar chart of how many words fall into each frequency category, one colour
# per category, with both axis labels rotated 60 degrees.
ggplot(dfrUNPrfFocf, aes(x = Fcat, y = Rfrq)) +
  geom_col(width = 0.8, fill = rainbow(nrow(dfrUNPrfFocf))) +
  labs(
    title = "Frequency Of Word Count",
    x = "Words With Frequency Less Than",
    y = "Frequency"
  ) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1),
    axis.text.y = element_text(angle = 60, hjust = 1, vjust = 1),
    plot.title = element_text(size = rel(1.5), colour = "blue")
  )
# Character length of every word kept in the frequency table
dfrUNPrfChrs <- data.frame(Chars = nchar(dfrUNPrfFreq$Words))
#intRowCount <- nrow(table(dfrUNPrfChrs))
# Histogram of word lengths (one bin per character count) with a vertical
# line marking the mean length.
ggplot(dfrUNPrfChrs, aes(x = Chars)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  geom_vline(
    xintercept = mean(dfrUNPrfChrs$Chars),
    color = "black", size = 1.5, alpha = 0.5
  ) +
  labs(x = "Word Length (Chars)", y = "Number Of Words (Frequency)")