Análisis de palabras de IA Talks

R.Pacheco

1/24/2022

Este documento tiene como finalidad analizar las charlas de IA Talks a través de un conteo de palabras y un análisis de frecuencia. Para este caso se hace el análisis de la charla “¿Cómo encontrar trabajo y la educación de tus sueños con Inteligencia Artificial?”, impartida por Pato Bichara el 9 de noviembre de 2021. (El conteo de las palabras se hace en inglés para mayor versatilidad.)

Paquetes

library(pacman)
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr")

# Embed the YouTube video at this URL in the rendered report.
embed_url("https://www.youtube.com/watch?v=VLkiR0jL71o")

Principales ecuaciones utilizadas

Si lo que se busca es incluir ecuaciones en texto, sería así: $E = mc^2$

Para explicar una ecuación se incluye de la siguiente forma:

$$E = mc^2 \tag{1}$$

En la ecuación (1) tenemos que:

$E$ = energía, $m$ = masa, $c$ = velocidad de la luz.

Concepto de PLN

El PLN comprende varios niveles de análisis: análisis morfológico o léxico, análisis sintáctico, análisis semántico y análisis pragmático. En el siguiente enlace pueden ver un artículo que explica más acerca de esto.

Funciones

# Map frequencies to a frequency-bucket label.
#
# Each value receives the label of the smallest upper bound that is greater
# than or equal to it (5, 10, 20, 50, 100, 500, 1,000); values above 1,000
# receive ">1,000". The bounded labels are left-padded with spaces to seven
# characters.
#
# @param value Numeric vector of frequencies.
# @return Character vector with one label per element of `value`. NA input
#   yields NA, empty input yields character(0), and names are carried over.
FreqCategory <- function(value) {
  upper_bounds <- c(5, 10, 20, 50, 100, 500, 1000)
  bucket_labels <- c(
    "      5",
    "     10",
    "     20",
    "     50",
    "    100",
    "    500",
    "  1,000",
    ">1,000"
  )
  # left.open = TRUE makes the intervals (lower, upper], so a value equal to
  # a bound lands in that bound's bucket, matching the `<=` comparisons this
  # lookup replaces. findInterval() is 0-based for values at or below the
  # first bound, hence the + 1L.
  bucket <- findInterval(value, upper_bounds, left.open = TRUE) + 1L
  strCategory <- bucket_labels[bucket]
  names(strCategory) <- names(value)
  strCategory
}

Datos

# NOTE(review): this hard-coded working directory ties the script to one
# machine; consider a project-relative path instead.
setwd("~/ESTADISTICA")
# Read the transcript, one vector element per line. The encoding is declared
# explicitly because, without it, accented characters are mangled on
# non-UTF-8 locales (the frequency table tail shows "yucatãn").
# Assumes the transcript file is UTF-8 encoded — TODO confirm.
IA <- readLines("IA_TALK_Eng.txt", encoding = "UTF-8")
head(IA)

## [1] "for questions and answers you can"    
## [2] ""                                     
## [3] "write them in johnny's part and"      
## [4] ""                                     
## [5] "after them they will be redirected in"
## [6] ""

Conteo de líneas

# Number of transcript lines: IA holds one element per line read.
intLineCount <- NROW(IA)
print(intLineCount)

## [1] 2256

Palabras por línea

# Split every line into tokens on single spaces.
lstUNPrfLines <- str_split(IA, " ")
# Words per line. str_split() returns "" for a blank line (and between
# consecutive spaces), so empty tokens are excluded: a blank line counts as
# zero words instead of one.
vciUNPrfWperL <- vapply(
  lstUNPrfLines,
  function(tokens) sum(nzchar(tokens)),
  integer(1)
)
# Mean number of words per line (blank lines included, with zero words).
mean(vciUNPrfWperL)

## [1] 3.845745

Conteo de palabras

# Flatten the per-line token lists into one character vector of tokens.
vcsUNPrfWords <- unlist(lstUNPrfLines)
# Total word count. Splitting blank lines yields "" tokens, which are not
# words, so only non-empty tokens are counted. The vector itself keeps its
# "" entries; they are dropped later by the cleaning step.
intWordCount <- sum(nzchar(vcsUNPrfWords))
# Print the total.
intWordCount

## [1] 8676

Mostrar palabras

# Preview the first 100 tokens.
head(vcsUNPrfWords,100)

##   [1] "for"            "questions"      "and"            "answers"       
##   [5] "you"            "can"            ""               "write"         
##   [9] "them"           "in"             "johnny's"       "part"          
##  [13] "and"            ""               "after"          "them"          
##  [17] "they"           "will"           "be"             "redirected"    
##  [21] "in"             ""               "our"            "satisfaction"  
##  [25] "survey"         "and"            "be"             "able"          
##  [29] "to"             ""               "receive"        "the"           
##  [33] "memory"         "of"             "this"           "session"       
##  [37] ""               "as"             "well"           "as"            
##  [41] "other"          "resources"      "for"            "innovators"    
##  [45] ""               "and"            "well"           "we"            
##  [49] "want"           "to"             "talk"           "a"             
##  [53] "little"         ""               "about"          "the"           
##  [57] "initiative"     "to"             "make"           "it"            
##  [61] "jalisco"        "and"            ""               "we"            
##  [65] "are"            "going"          "to"             "present"       
##  [69] "you"            "a"              "little"         "video"         
##  [73] "thanks"         ""               "perla"          "in"            
##  [77] "jalisco"        "is"             "an"             "initiative"    
##  [81] ""               "led"            "by"             "the"           
##  [85] "inter-american" ""               "development"    "bank"          
##  [89] "the"            "monterrey"      "technology"     "bank"          
##  [93] ""               "in"             "guadalajara"    "the"           
##  [97] "government"     "of"             "jalisco"        "and"

Limpieza de palabras

# Normalise the tokens: lower-case them, strip digits, punctuation,
# whitespace and leftover symbols, then drop tokens that end up empty.

# lower case
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# remove digits
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:digit:]]", "")
# remove punctuation
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:punct:]]", "")
# remove white space
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:space:]]", "")
# remove leftover special characters. The hyphen is placed last so it is a
# literal: written as "&-_" in the middle of the class it is a range from
# "&" to "_", which also spans digits, upper-case letters and other symbols.
# "+" and "^" are listed explicitly because that accidental range was what
# removed them before.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[~@#$%&_=<>+^-]", "")
# remove literal dollar signs. fixed() is needed because a bare "$" is the
# end-of-string anchor, which matches an empty position and removes nothing.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = fixed("$"), "")
# drop empty tokens; done last so that tokens emptied by any of the
# replacements above are removed too
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# preview the first 100 cleaned words
head(vcsUNPrfWords, 100)

##   [1] "for"           "questions"     "and"           "answers"      
##   [5] "you"           "can"           "write"         "them"         
##   [9] "in"            "johnnys"       "part"          "and"          
##  [13] "after"         "them"          "they"          "will"         
##  [17] "be"            "redirected"    "in"            "our"          
##  [21] "satisfaction"  "survey"        "and"           "be"           
##  [25] "able"          "to"            "receive"       "the"          
##  [29] "memory"        "of"            "this"          "session"      
##  [33] "as"            "well"          "as"            "other"        
##  [37] "resources"     "for"           "innovators"    "and"          
##  [41] "well"          "we"            "want"          "to"           
##  [45] "talk"          "a"             "little"        "about"        
##  [49] "the"           "initiative"    "to"            "make"         
##  [53] "it"            "jalisco"       "and"           "we"           
##  [57] "are"           "going"         "to"            "present"      
##  [61] "you"           "a"             "little"        "video"        
##  [65] "thanks"        "perla"         "in"            "jalisco"      
##  [69] "is"            "an"            "initiative"    "led"          
##  [73] "by"            "the"           "interamerican" "development"  
##  [77] "bank"          "the"           "monterrey"     "technology"   
##  [81] "bank"          "in"            "guadalajara"   "the"          
##  [85] "government"    "of"            "jalisco"       "and"          
##  [89] "mainz"         "perla"         "calixto"       "works"        
##  [93] "through"       "the"           "articulation"  "of"           
##  [97] "academia"      "civil"         "society"       "public"

Data frame de palabras normales

# Wrap the cleaned tokens in a one-column data frame whose column, "Words",
# holds character strings.
dfrUNPrfWords <- data.frame(
  Words = as.character(vcsUNPrfWords),
  stringsAsFactors = FALSE
)
# Show the first ten rows.
head(dfrUNPrfWords, 10)

##        Words
## 1        for
## 2  questions
## 3        and
## 4    answers
## 5        you
## 6        can
## 7      write
## 8       them
## 9         in
## 10   johnnys

Conteo de palabras “normales”

# Frequency table: one row per distinct word with its number of occurrences
# (Freq), sorted from most to least frequent.
dfrUNPrfFreq <- arrange(
  summarise(group_by(dfrUNPrfWords, Words), Freq = n()),
  desc(Freq)
)
head(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words  Freq
##   <chr> <int>
## 1 the     349
## 2 to      279
## 3 that    260
## 4 and     258
## 5 is      191
## 6 of      187

Nube de palabras normales

# Word cloud of the (up to) 100 most frequent words. head() is used instead
# of [1:100] so that a table with fewer than 100 rows is not padded with NA,
# and FALSE is spelled out because `F` is an ordinary, reassignable variable.
wordcloud(
  head(dfrUNPrfFreq$Words, 100),
  head(dfrUNPrfFreq$Freq, 100),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)

Data frame de palabras realmente significantes

En esta sección quitaremos las “stop words”

# Keep only significant words: drop very short words, common "stop words"
# and words that carry no meaning in this context.

# Common words (conjunctions, pronouns, auxiliaries, ...) to discard.
vcsCmnWords <- c(
  "the", "that", "and", "to", "have", "it", "not", "we", "going", "what",
  "for", "in", "they", "you", "is", "as", "of", "do", "all", "on",
  "are", "this", "at", "with", "can", "will", "those", "but", "had", "well",
  "us", "then", "has", "from", "your", "them", "there", "was", "if", "like",
  "these", "their", "our", "many", "being", "its", "bit", "just", "where",
  "also", "another", "see", "even", "now", "already", "about", "would",
  "want", "out"
)
# Words that are not significant for this particular talk.
vcsBadWords <- c("say", "very", "be")

dfrUNPrfWords <- dfrUNPrfWords %>%
  # words of one or two characters are removed
  filter(str_length(Words) > 2) %>%
  filter(!(Words %in% vcsCmnWords)) %>%
  filter(!(Words %in% vcsBadWords))
# Show the first rows that remain.
head(dfrUNPrfWords)

##       Words
## 1 questions
## 2   answers
## 3     write
## 4   johnnys
## 5      part
## 6     after

Conteo de palabras significativas

# Rebuild the frequency table from the filtered words: occurrences per
# distinct word (Freq), most frequent first.
dfrUNPrfFreq <- arrange(
  summarise(group_by(dfrUNPrfWords, Words), Freq = n()),
  desc(Freq)
)
head(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words         Freq
##   <chr>        <int>
## 1 think           56
## 2 because         40
## 3 more            39
## 4 how             33
## 5 intelligence    32
## 6 artificial      31

“cola” de palabras significativas

# Show the last rows of the frequency table, i.e. the least frequent words.
tail(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words    Freq
##   <chr>   <int>
## 1 write       1
## 2 wrong       1
## 3 yet         1
## 4 youtube     1
## 5 ypo         1
## 6 yucatãn     1

Eliminar palabras dispersas

# palabras con una frecuencia absoluta menor a 5 
dfrUNPrfFreq <- filter(dfrUNPrfFreq, Freq>5)
tail(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words        Freq
##   <chr>       <int>
## 1 practically     6
## 2 recruitment     6
## 3 term            6
## 4 thank           6
## 5 why             6
## 6 yes             6

Conteo final de palabras

# Number of distinct words left: the frequency table has one row per word.
intWordCountFinal <- nrow(dfrUNPrfFreq)
# Print the count.
intWordCountFinal

## [1] 109

Categorización por frecuencias

# Add the frequency-category column. Freq is referenced through the data
# mask rather than via dfrUNPrfFreq$Freq.
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat = FreqCategory(Freq))
# Frequency of categorised frequencies: number of words (Rfrq) per category.
dfrUNPrfFocf <- dfrUNPrfFreq %>%
  group_by(Fcat) %>%
  summarise(Rfrq = n())
# Turn the category into an ordered factor whose levels follow the row order
# of the summary. TRUE is spelled out because `T` is reassignable.
dfrUNPrfFocf$Fcat <- factor(
  dfrUNPrfFocf$Fcat,
  levels = dfrUNPrfFocf$Fcat,
  ordered = TRUE
)
# Show the category table.
head(dfrUNPrfFocf, 10)

## # A tibble: 4 x 2
##   Fcat       Rfrq
##   <ord>     <int>
## 1 "     10"    70
## 2 "     20"    29
## 3 "     50"     9
## 4 "    100"     1

Nueva nube de palabras

# Word cloud of the (up to) 50 most frequent significant words. head() is
# used instead of [1:50] so that a table with fewer than 50 rows is not
# padded with NA, and FALSE is spelled out because `F` is reassignable.
wordcloud(
  head(dfrUNPrfFreq$Words, 50),
  head(dfrUNPrfFreq$Freq, 50),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)

Gráfica de barras de palabras

# Horizontal bar chart of the (up to) 30 most frequent words. The palette is
# sized to the number of rows actually plotted; a fixed rainbow(30) makes
# ggplot fail when the table has fewer than 30 rows.
ggplot(
  slice(dfrUNPrfFreq, 1:30),
  aes(x = reorder(Words, -Freq), y = Freq)
) +
  geom_col(fill = rainbow(min(30L, nrow(dfrUNPrfFreq)))) +
  ylab("Frecuencia") +
  xlab("Palabras") +
  ggtitle("Primeras 30 palabras con mayor frecuencia") +
  theme(plot.title = element_text(size = rel(1.5), colour = "blue")) +
  # flip the axes so the words are listed along the vertical axis
  coord_flip()

Gráfica de frecuencia

# Bar chart of how many words fall into each frequency category, one colour
# per category.
ggplot(dfrUNPrfFocf, aes(x = Fcat, y = Rfrq)) +
  geom_col(width = 0.8, fill = rainbow(nrow(dfrUNPrfFocf))) +
  labs(
    x = "Words With Frequency Less Than",
    y = "Frequency",
    title = "Frequency Of Word Count"
  ) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1),
    axis.text.y = element_text(angle = 60, hjust = 1, vjust = 1),
    plot.title = element_text(size = rel(1.5), colour = "blue")
  )

Longitud de palabras

# Histogram of word lengths (in characters) over the words in the frequency
# table, with a vertical line at the mean length.
dfrUNPrfChrs <- data.frame(Chars = nchar(dfrUNPrfFreq$Words))
ggplot(dfrUNPrfChrs, aes(x = Chars)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  geom_vline(
    xintercept = mean(dfrUNPrfChrs$Chars),
    color = "black",
    size = 1.5,
    alpha = .5
  ) +
  labs(x = "Word Length (Chars)", y = "Number Of Words (Frequency)")

Descargar Código

# Embed the R Markdown source of this report via xfun::embed_file().
xfun::embed_file("AnalisisDePalabras_English.Rmd")

Download AnalisisDePalabras_English.Rmd

Descargar Datos

# Embed the transcript data file via xfun::embed_file().
xfun::embed_file("IA_TALK_Eng.txt")

Download IA_TALK_Eng.txt