Este documento tiene como finalidad analizar charlas mediante un conteo de palabras y un análisis de frecuencia. En este caso se analiza la charla “¿Cómo encontrar trabajo y la educación de tus sueños con Inteligencia Artificial?”, impartida por Pato Bichara el 9 de noviembre de 2021. (El conteo de palabras se realiza sobre la transcripción en inglés para mayor versatilidad.)
Paquetes
# Attach pacman, then use p_load() to load the packages this report relies on.
library(pacman)
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr")
# Embed the recording of the talk in the rendered document.
# NOTE(review): embed_url() presumably comes from vembedr — confirm.
embed_url("https://www.youtube.com/watch?v=VLkiR0jL71o")
Principales ecuaciones utilizadas
Si lo que se busca es incluir ecuaciones en texto, sería así: $E = mc^2$
Para explicar una ecuación se incluye de la siguiente forma:
$$E = mc^2 \tag{1}$$
En la ecuación (1) tenemos que:
$E$ = energía, $m$ = masa, $c$ = velocidad de la luz.
Concepto de PLN
El procesamiento del lenguaje natural (PLN) comprende cuatro niveles de análisis: morfológico o léxico, sintáctico, semántico y pragmático. En el siguiente enlace pueden ver un artículo que explica más acerca de esto.
Funciones
# Label each frequency with the bucket it falls into.
#
# value: vector of frequencies.
# Returns, element by element, the label of the narrowest bucket whose upper
# bound is not exceeded (" 5", " 10", " 20", " 50", " 100", " 500", " 1,000"),
# or ">1,000" for anything above 1000.
FreqCategory <- function(value) {
  upperBounds <- c(5, 10, 20, 50, 100, 500, 1000)
  bucketLabels <- c(" 5", " 10", " 20", " 50", " 100", " 500", " 1,000")
  # Start from the catch-all label and apply the bounds from widest to
  # narrowest, so the narrowest matching bucket is the one that sticks.
  strCategory <- ">1,000"
  for (i in rev(seq_along(upperBounds))) {
    strCategory <- ifelse(value <= upperBounds[i], bucketLabels[i], strCategory)
  }
  strCategory
}
Datos
# Read the English transcript of the talk, one element per line of the file.
# The path is built explicitly instead of calling setwd(), so loading the data
# does not change the working directory as a side effect.
# encoding = "UTF-8" marks the text as UTF-8 so accented characters are not
# mangled by the later string operations.
# NOTE(review): assumes the transcript file is saved as UTF-8 — confirm.
IA <- readLines(
  file.path("~/ESTADISTICA", "IA_TALK_Eng.txt"),
  encoding = "UTF-8"
)
head(IA)
## [1] "for questions and answers you can"
## [2] ""
## [3] "write them in johnny's part and"
## [4] ""
## [5] "after them they will be redirected in"
## [6] ""
Conteo de líneas
# Number of lines read from the transcript (every element of IA counts).
intLineCount <- length(IA)
print(intLineCount)
## [1] 2256
Palabras por línea
# Split every line into tokens on single spaces.
lstUNPrfLines <- str_split(IA, pattern = " ")
# Tokens per line. lengths() always yields an integer vector, even when there
# are no lines at all (unlist(lapply(x, length)) would return NULL there).
vciUNPrfWperL <- lengths(lstUNPrfLines)
# Mean number of tokens per line.
mean(vciUNPrfWperL)
## [1] 3.845745
Conteo de palabras
# Flatten the per-line token lists into one character vector of tokens.
vcsUNPrfWords <- unlist(lstUNPrfLines)
# The raw token count is the length of that vector.
intWordCount <- length(vcsUNPrfWords)
print(intWordCount)
## [1] 8676
Mostrar palabras
# Preview the first 100 raw tokens.
head(vcsUNPrfWords, n = 100)
## [1] "for" "questions" "and" "answers"
## [5] "you" "can" "" "write"
## [9] "them" "in" "johnny's" "part"
## [13] "and" "" "after" "them"
## [17] "they" "will" "be" "redirected"
## [21] "in" "" "our" "satisfaction"
## [25] "survey" "and" "be" "able"
## [29] "to" "" "receive" "the"
## [33] "memory" "of" "this" "session"
## [37] "" "as" "well" "as"
## [41] "other" "resources" "for" "innovators"
## [45] "" "and" "well" "we"
## [49] "want" "to" "talk" "a"
## [53] "little" "" "about" "the"
## [57] "initiative" "to" "make" "it"
## [61] "jalisco" "and" "" "we"
## [65] "are" "going" "to" "present"
## [69] "you" "a" "little" "video"
## [73] "thanks" "" "perla" "in"
## [77] "jalisco" "is" "an" "initiative"
## [81] "" "led" "by" "the"
## [85] "inter-american" "" "development" "bank"
## [89] "the" "monterrey" "technology" "bank"
## [93] "" "in" "guadalajara" "the"
## [97] "government" "of" "jalisco" "and"
Limpieza de palabras
# Normalise the raw tokens: lower-case them, strip digits, punctuation,
# whitespace and leftover symbols, then drop tokens that ended up empty.

# lower case
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# remove numbers
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:digit:]]", "")
# remove punctuation
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:punct:]]", "")
# remove white space
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:space:]]", "")
# remove special characters
# The hyphen is escaped so it is a literal: unescaped, "&-_" is read as a
# character range from "&" to "_". "+" and "^" are listed explicitly because
# that accidental range used to remove them as well. "$" is part of this set,
# so no separate dollar-sign step is needed (an unescaped "$" pattern is the
# end-of-string anchor and removes nothing).
vcsUNPrfWords <- str_replace_all(
  vcsUNPrfWords,
  pattern = "[~@#\\$%\\&\\-_=<>+\\^]",
  ""
)
# drop tokens that are now empty
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# preview the first 100 cleaned words
head(vcsUNPrfWords, 100)
## [1] "for" "questions" "and" "answers"
## [5] "you" "can" "write" "them"
## [9] "in" "johnnys" "part" "and"
## [13] "after" "them" "they" "will"
## [17] "be" "redirected" "in" "our"
## [21] "satisfaction" "survey" "and" "be"
## [25] "able" "to" "receive" "the"
## [29] "memory" "of" "this" "session"
## [33] "as" "well" "as" "other"
## [37] "resources" "for" "innovators" "and"
## [41] "well" "we" "want" "to"
## [45] "talk" "a" "little" "about"
## [49] "the" "initiative" "to" "make"
## [53] "it" "jalisco" "and" "we"
## [57] "are" "going" "to" "present"
## [61] "you" "a" "little" "video"
## [65] "thanks" "perla" "in" "jalisco"
## [69] "is" "an" "initiative" "led"
## [73] "by" "the" "interamerican" "development"
## [77] "bank" "the" "monterrey" "technology"
## [81] "bank" "in" "guadalajara" "the"
## [85] "government" "of" "jalisco" "and"
## [89] "mainz" "perla" "calixto" "works"
## [93] "through" "the" "articulation" "of"
## [97] "academia" "civil" "society" "public"
Data frame de palabras normales
# One-column data frame ("Words") holding the cleaned words as character
# strings.
dfrUNPrfWords <- data.frame(
  Words = as.character(vcsUNPrfWords),
  stringsAsFactors = FALSE
)
# Preview the first ten rows.
head(dfrUNPrfWords, n = 10)
## Words
## 1 for
## 2 questions
## 3 and
## 4 answers
## 5 you
## 6 can
## 7 write
## 8 them
## 9 in
## 10 johnnys
Conteo de palabras “normales”
# Count how often each distinct word occurs, then sort from most to least
# frequent.
dfrUNPrfFreq <- summarise(group_by(dfrUNPrfWords, Words), Freq = n())
dfrUNPrfFreq <- arrange(dfrUNPrfFreq, desc(Freq))
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 the 349
## 2 to 279
## 3 that 260
## 4 and 258
## 5 is 191
## 6 of 187
Nube de palabras normales
# Word cloud of the (at most) 100 most frequent words. head() replaces
# [1:100] so a table with fewer than 100 rows is not padded with NA entries,
# and FALSE is spelled out because F can be reassigned.
wordcloud(
  head(dfrUNPrfFreq$Words, 100),
  head(dfrUNPrfFreq$Freq, 100),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
Data frame de palabras realmente significativas
- En esta sección quitaremos las “stop words”
# Keep only the significant words.

# Stop words: common words such as conjunctions and pronouns.
vcsCmnWords <- c(
  "the", "that", "and", "to", "have", "it", "not", "we", "going", "what",
  "for", "in", "they", "you", "is", "as", "of", "do", "all", "on", "are",
  "this", "at", "with", "can", "will", "those", "but", "had", "well", "us",
  "then", "has", "from", "your", "them", "there", "was", "if", "like",
  "these", "their", "our", "many", "being", "its", "bit", "just", "where",
  "also", "another", "see", "even", "now", "already", "about", "would",
  "want", "out"
)
# Words that carry no meaning in this particular context.
vcsBadWords <- c("say", "very", "be")

# Drop words of two characters or fewer, then the stop words, then the
# context-specific words.
dfrUNPrfWords <- dfrUNPrfWords %>%
  filter(str_length(Words) > 2) %>%
  filter(!(Words %in% vcsCmnWords)) %>%
  filter(!(Words %in% vcsBadWords))

# Preview the remaining words.
head(dfrUNPrfWords)
## Words
## 1 questions
## 2 answers
## 3 write
## 4 johnnys
## 5 part
## 6 after
Conteo de palabras significativas
# Recount the occurrences per word on the filtered data, most frequent first.
dfrUNPrfFreq <- summarise(group_by(dfrUNPrfWords, Words), Freq = n())
dfrUNPrfFreq <- arrange(dfrUNPrfFreq, desc(Freq))
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 think 56
## 2 because 40
## 3 more 39
## 4 how 33
## 5 intelligence 32
## 6 artificial 31
“cola” de palabras significativas
# Show the last six rows, i.e. the least frequent words.
tail(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 write 1
## 2 wrong 1
## 3 yet 1
## 4 youtube 1
## 5 ypo 1
## 6 yucatãn 1
Eliminar palabras dispersas
# Keep only the words that occur more than 5 times; words with a frequency
# of 5 or less are dropped.
dfrUNPrfFreq <- dfrUNPrfFreq %>%
  filter(Freq > 5)
tail(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 practically 6
## 2 recruitment 6
## 3 term 6
## 4 thank 6
## 5 why 6
## 6 yes 6
Conteo final de palabras
# Number of distinct words left in the frequency table (one row per word).
intWordCountFinal <- nrow(dfrUNPrfFreq)
print(intWordCountFinal)
## [1] 109
Categorización por frecuencias
# Add the frequency-category column (bucket label for each word's frequency).
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat = FreqCategory(Freq))
# Number of words per frequency category. The categories are ordered by the
# smallest frequency they contain rather than by the text of their labels,
# because a lexical sort of the labels (" 10", " 100", " 20", ...) does not
# follow numeric order.
dfrUNPrfFocf <- dfrUNPrfFreq %>%
  group_by(Fcat) %>%
  summarise(Rfrq = n(), MinFreq = min(Freq)) %>%
  arrange(MinFreq) %>%
  select(-MinFreq)
# Ordered factor whose levels follow the row order established above, so the
# plot axis runs from the lowest to the highest bucket.
dfrUNPrfFocf$Fcat <- factor(
  dfrUNPrfFocf$Fcat,
  levels = dfrUNPrfFocf$Fcat,
  ordered = TRUE
)
# Preview the category table.
head(dfrUNPrfFocf, 10)
## # A tibble: 4 x 2
## Fcat Rfrq
## <ord> <int>
## 1 " 10" 70
## 2 " 20" 29
## 3 " 50" 9
## 4 " 100" 1
Nueva nube de palabras
# Word cloud of the (at most) 50 most frequent significant words. head()
# replaces [1:50] so a table with fewer than 50 rows is not padded with NA
# entries, and FALSE is spelled out because F can be reassigned.
wordcloud(
  head(dfrUNPrfFreq$Words, 50),
  head(dfrUNPrfFreq$Freq, 50),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
Gráfica de barras de palabras
# Horizontal bar chart of the most frequent words (up to 30).
# The palette is sized from the rows actually plotted: a fixed rainbow(30)
# makes ggplot fail when the table holds fewer than 30 words.
dfrTopWords <- head(dfrUNPrfFreq, 30)
ggplot(dfrTopWords, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = rainbow(nrow(dfrTopWords))) +
  ylab("Frecuencia") +
  xlab("Palabras") +
  ggtitle("Primeras 30 palabras con mayor frecuencia") +
  theme(plot.title = element_text(size = rel(1.5), colour = "blue")) +
  coord_flip()
Gráfica de frecuencia
# Bar chart of how many words fall into each frequency category, one colour
# per category.
ggplot(dfrUNPrfFocf, aes(x = Fcat, y = Rfrq)) +
  geom_bar(
    stat = "identity",
    width = 0.8,
    fill = rainbow(length(dfrUNPrfFocf$Fcat))
  ) +
  labs(
    x = "Words With Frequency Less Than",
    y = "Frequency",
    title = "Frequency Of Word Count"
  ) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1),
    axis.text.y = element_text(angle = 60, hjust = 1, vjust = 1),
    plot.title = element_text(size = rel(1.5), colour = "blue")
  )
Longitud de palabras
# Character length of every word kept in the frequency table.
dfrUNPrfChrs <- data.frame(Chars = nchar(dfrUNPrfFreq$Words))
# Histogram of word lengths (one bin per length) with a vertical line at the
# mean length.
ggplot(dfrUNPrfChrs, aes(x = Chars)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  geom_vline(
    xintercept = mean(dfrUNPrfChrs$Chars),
    color = "black",
    size = 1.5,
    alpha = .5
  ) +
  xlab("Word Length (Chars)") +
  ylab("Number Of Words (Frequency)")
Descargar código
# Embed the R Markdown source file in the rendered document so readers can
# download it.
xfun::embed_file("AnalisisDePalabras_English.Rmd")