Análisis de conteo de palabras para conocer su frecuencia

Video de youtube de la charla

# Pass the YouTube URL of the talk to embed_url().
# NOTE(review): embed_url() is not defined in this file; presumably it is
# vembedr::embed_url and renders an embedded player -- confirm.
embed_url("https://youtu.be/Fzq0EQ2m6EQ")

Este es un ejercicio de la materia de probabilidad y estadística de la clase de las 9:00 a.m. (LMV) del departamento de matemáticas de ITSON.
El presente documento tiene como objetivo analizar las charlas mediante un conteo de palabras y un análisis de frecuencia.
Para este caso se hace el análisis de la charla “Redefiniendo retos en oportunidades con IA”, impartida por Michael Burkhardt el 20 de enero de 2022.

Procesamiento del lenguaje natural

Entendiendo el lenguaje

Funciones

#' Map numeric frequencies to fixed-width category labels
#'
#' Each value receives the label of the smallest upper bound that is greater
#' than or equal to it (5, 10, 20, 50, 100, 500, 1,000); values above 1,000
#' are labelled ">1,000". The numeric labels are left-padded with spaces to a
#' common width of seven characters (presumably so that they sort as text in
#' numeric order -- confirm against the callers).
#'
#' @param value Numeric vector of frequencies.
#' @return Character vector of the same length as `value`; `NA` inputs give
#'   `NA`, and empty input gives `character(0)`.
FreqCategory <- function(value) {
    # Inclusive upper bound of every bucket, in increasing order.
    vcnBounds <- c(5, 10, 20, 50, 100, 500, 1000)
    # One label per bucket plus a final catch-all for values above 1000.
    vcsLabels <- c("      5", "     10", "     20", "     50",
                   "    100", "    500", "  1,000", ">1,000")
    # left.open = TRUE makes the intervals (lower, upper], which reproduces
    # the `<=` comparisons; findInterval() returns 0 for the first bucket,
    # hence the + 1L to turn it into a label index.
    vcsLabels[findInterval(value, vcnBounds, left.open = TRUE) + 1L]
}

Datos

# Read the talk transcript into a character vector, one element per line.
# The file is addressed by its path instead of calling setwd(): changing the
# working directory is a global side effect, and when the document is knitted
# the directory is restored after the chunk, so nothing later can rely on it.
iatalkmichael <- readLines(file.path("~/ea9am", "iatalkmichael.txt"))
# Show the first few lines
head(iatalkmichael)

## [1] "[Music]" ""        "[Music]" ""        "[Music]" ""

Conteo de líneas

# Number of transcript lines = length of the character vector
intLineCount <- length(iatalkmichael)
# Print the line count
intLineCount

## [1] 1700

Palabras por línea

# Split every line into tokens on single spaces.
# NOTE: an empty line yields one empty-string token, so it still counts as
# one "word" in the statistics below.
lstUNPrfLines <- str_split(iatalkmichael, " ")
# Tokens per line; lengths() returns the element sizes of a list directly,
# replacing the unlist(lapply(x, length)) round trip.
vciUNPrfWperL <- lengths(lstUNPrfLines)
# Print the mean number of tokens per line
mean(vciUNPrfWperL)

## [1] 3.084118

Conteo de palabras

# Flatten the per-line token lists into a single vector of words
vcsUNPrfWords <- unlist(lstUNPrfLines)
# Total word count = length of that vector (empty tokens included)
intWordCount <- length(vcsUNPrfWords)
# Print the total
intWordCount

## [1] 5243

Mostrar palabras

# Show the first 100 raw tokens, before any cleaning
head(vcsUNPrfWords,100)

##   [1] "[Music]"      ""             "[Music]"      ""             "[Music]"     
##   [6] ""             "[Music]"      ""             "artificial"   "response"    
##  [11] ""             "[Music]"      ""             "[Music]"      ""            
##  [16] "[Music]"      ""             "good"         "morning"      "everyone"    
##  [21] "i'm"          "buenos"       "dias"         "hi"           ""            
##  [26] "my"           "name"         "is"           "michael"      ""            
##  [31] "i'm"          "very"         "happy"        "to"           "be"          
##  [36] "here"         "and"          ""             "share"        ""            
##  [41] "a"            "bit"          "about"        ""             "my"          
##  [46] "journey"      "and"          ""             "how"          "it"          
##  [51] "all"          "started"      "basically"    "in"           "mexico"      
##  [56] ""             "but"          "i"            "will"         "talk"        
##  [61] "about"        "that"         "later"        ""             "and"         
##  [66] "yeah"         "i'm"          "very"         "glad"         "to"          
##  [71] "be"           "here"         ""             "um"           "i"           
##  [76] "want"         "to"           "start"        "with"         "a"           
##  [81] "little"       ""             "introduction" "into"         "myself"      
##  [86] ""             "which"        "you"          "know"         "gives"       
##  [91] "you"          "a"            "better"       "idea"         ""            
##  [96] "of"           "from"         ""             "from"         "what"

Limpieza de palabras

# Normalise the tokens one step at a time.
# lower case
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# remove digits
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:digit:]]", "")
# remove punctuation
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:punct:]]", "")
# remove white space
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:space:]]", "")
# remove the listed special characters. The hyphen is escaped so that it is
# a literal "-": unescaped, "&-_" was read as a range running from "&" to
# "_", which silently covered digits, upper-case letters and other symbols
# (e.g. "+" and "^") that were never meant to be listed here.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[~@#$%&\\-_=<>]", "")
# remove literal dollar signs. fixed() is needed because a bare "$" pattern
# is the end-of-string anchor and therefore replaces nothing.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern=fixed("$"), "")
# drop tokens that became empty; done last so that every replacement above
# has already been applied
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# Show the first 100 cleaned words
head(vcsUNPrfWords,100)

##   [1] "music"          "music"          "music"          "music"         
##   [5] "artificial"     "response"       "music"          "music"         
##   [9] "music"          "good"           "morning"        "everyone"      
##  [13] "im"             "buenos"         "dias"           "hi"            
##  [17] "my"             "name"           "is"             "michael"       
##  [21] "im"             "very"           "happy"          "to"            
##  [25] "be"             "here"           "and"            "share"         
##  [29] "a"              "bit"            "about"          "my"            
##  [33] "journey"        "and"            "how"            "it"            
##  [37] "all"            "started"        "basically"      "in"            
##  [41] "mexico"         "but"            "i"              "will"          
##  [45] "talk"           "about"          "that"           "later"         
##  [49] "and"            "yeah"           "im"             "very"          
##  [53] "glad"           "to"             "be"             "here"          
##  [57] "um"             "i"              "want"           "to"            
##  [61] "start"          "with"           "a"              "little"        
##  [65] "introduction"   "into"           "myself"         "which"         
##  [69] "you"            "know"           "gives"          "you"           
##  [73] "a"              "better"         "idea"           "of"            
##  [77] "from"           "from"           "what"           "perspective"   
##  [81] "and"            "background"     "im"             "coming"        
##  [85] "so"             "i"              "did"            "a"             
##  [89] "bachelor"       "in"             "a"              "business"      
##  [93] "administration" "in"             "germany"        "im"            
##  [97] "also"           "from"           "germany"        "originally"

Data frame de palabras normales

# Wrap the cleaned words in a one-column data frame whose "Words" column is
# stored as character strings (never as a factor)
dfrUNPrfWords <- data.frame(
    Words = as.character(vcsUNPrfWords),
    stringsAsFactors = FALSE
)
# Show the first ten rows
head(dfrUNPrfWords, 10)

##         Words
## 1       music
## 2       music
## 3       music
## 4       music
## 5  artificial
## 6    response
## 7       music
## 8       music
## 9       music
## 10       good

Conteo de palabras “normales”

# Summarise the data: count the occurrences of each distinct word (Freq)
# and sort the result from most to least frequent
dfrUNPrfFreq <- dfrUNPrfWords %>% 
                group_by(Words) %>% 
                summarise(Freq=n()) %>% 
                arrange(desc(Freq))
# Show the most frequent words
head(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words  Freq
##   <chr> <int>
## 1 and     257
## 2 the     162
## 3 you     156
## 4 to      117
## 5 i       108
## 6 um      105

Nube de palabras normales

# Word cloud of the 100 most frequent words.
# head() replaces [1:100] so that a table with fewer than 100 rows does not
# pad the vectors with NA; FALSE is spelled out because F can be reassigned.
wordcloud(head(dfrUNPrfFreq$Words, 100), head(dfrUNPrfFreq$Freq, 100), random.order=FALSE, max.words=100, colors=brewer.pal(8, "Dark2"))

Data frame de palabras realmente significativas

En esta sección quitaremos las “stop words”.

# Keep significant words only.

# Common "stop words" (conjunctions, articles, fillers ...) in Spanish and
# English that carry no meaning for this analysis
vcsCmnWords <- c(
    "de","que","en","y","la","a","el","es","una","un","pues","no","para",
    "los","se","las","como","con","más","por","lo","hay","del","o",
    "entonces","este","está","nos","pero","también","creo","porque",
    "también","yo","ya","esta","si","me","al","son","tiene","donde","bueno",
    "ha","sobre","ejemplo","bien","gracias","ser","eso","todo","uso","ver",
    "tener","esto","estos","muchas","cómo","cuando","sea","tenemos","su",
    "tienen","así","desde","han","parte","ahí","les","tal","qué","estar",
    "the", "and", "um", "you", "that", "what", "music", "maybe", "they",
    "not", "have", "can", "so", "because", "was", "to", "so", "my", "it",
    "know", "of", "a", "for", "more", "where", "things", "very", "will",
    "but", "thats", "which", "with", "there", "also", "yeah", "from",
    "really", "its", "about", "are", "right", "then", "when", "all",
    "theres", "this", "just", "how", "many", "say", "something", "has",
    "now", "your", "always", "basically", "bit", "who", "first", "put",
    "little", "like", "would", "happening", "much", "end", "dont", "did",
    "into", "who", "lets", "had", "okay", "one", "those", "way", "want",
    "might", "going", "certain", "see", "even", "were", "trust", "here"
)

# Words that are not significant in this particular context
vcsBadWords <- c("decir","muy","están")

# A row survives only if its word is longer than two characters and appears
# in neither of the two exclusion lists
dfrUNPrfWords <- dfrUNPrfWords %>%
    filter(
        str_length(Words) > 2,
        !(Words %in% vcsCmnWords),
        !(Words %in% vcsBadWords)
    )

# Show the first remaining rows
head(dfrUNPrfWords)

##        Words
## 1 artificial
## 2   response
## 3       good
## 4    morning
## 5   everyone
## 6     buenos

Conteo de palabras significativas

# Recount the occurrences of each remaining word (Freq) and sort the result
# from most to least frequent
dfrUNPrfFreq <- dfrUNPrfWords %>% 
                group_by(Words) %>% 
                summarise(Freq=n()) %>% 
                arrange(desc(Freq))
# Show the most frequent significant words
head(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words     Freq
##   <chr>    <int>
## 1 people      29
## 2 projects    16
## 3 doing       13
## 4 project     13
## 5 think       13
## 6 been        12

“cola” de palabras significativas

# Show the last rows of the frequency table, i.e. the least frequent words
tail(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words    Freq
##   <chr>   <int>
## 1 within      1
## 2 without     1
## 3 wouldnt     1
## 4 writing     1
## 5 yes         1
## 6 zone        1

Eliminar palabras dispersas

# Drop sparse words: keep only words whose absolute frequency is greater
# than 5 (words occurring 5 times or fewer are removed)
dfrUNPrfFreq <- filter(dfrUNPrfFreq, Freq>5)
# Show the least frequent words that remain
tail(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words     Freq
##   <chr>    <int>
## 1 started      6
## 2 super        6
## 3 talk         6
## 4 wanted       6
## 5 world        6
## 6 yourself     6

Conteo final de palabras

# Number of distinct words left = number of rows in the frequency table
intWordCountFinal <- nrow(dfrUNPrfFreq)
# Print the final count
intWordCountFinal

## [1] 50

Categorización por frecuencias

# Add the frequency-category column (Fcat). The column is referenced by its
# bare name inside mutate() rather than through dfrUNPrfFreq$Freq.
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat=FreqCategory(Freq))
# New data frame: how many words (Rfrq) fall into each frequency category
dfrUNPrfFocf <- dfrUNPrfFreq %>% group_by(Fcat) %>% summarise(Rfrq=n())
# Turn Fcat into an ordered factor whose levels follow the row order
# produced by summarise(); TRUE is spelled out because T can be reassigned.
dfrUNPrfFocf$Fcat <- factor(dfrUNPrfFocf$Fcat, levels=dfrUNPrfFocf$Fcat, ordered=TRUE)
# Show the category table
head(dfrUNPrfFocf,10)

## # A tibble: 3 x 2
##   Fcat       Rfrq
##   <ord>     <int>
## 1 "     10"    42
## 2 "     20"     7
## 3 "     50"     1

Nueva nube de palabras

# Word cloud of the (up to) 50 most frequent significant words.
# head() replaces the hard-coded [1:50] so that a shorter table does not pad
# the vectors with NA; FALSE is spelled out because F can be reassigned.
wordcloud(head(dfrUNPrfFreq$Words, 50), head(dfrUNPrfFreq$Freq, 50), random.order=FALSE, max.words=100, colors=brewer.pal(8, "Dark2"))

Gráfica de barras de palabras

# Horizontal bar chart of the (up to) 30 most frequent words.
# The palette is sized from the rows actually plotted: a fixed rainbow(30)
# fails whenever fewer than 30 words remain. local() keeps the helper data
# frame out of the global environment and still returns the plot so that it
# is printed.
local({
    dfrTopWords <- slice(dfrUNPrfFreq, 1:30)
    ggplot(dfrTopWords, aes(x=reorder(Words,-Freq),y=Freq)) +
        geom_bar(stat="identity", fill=rainbow(nrow(dfrTopWords))) +
        ylab("Frequency") +
        xlab("Words") +
        ggtitle("Primeras 30 palabras con mayor frecuencia") +
        theme(plot.title=element_text(size=rel(1.5), colour="blue")) +
        coord_flip()
})

Gráfica de frecuencia

# Bar chart of the number of words (Rfrq) in each frequency category (Fcat),
# one rainbow colour per category
ggplot(dfrUNPrfFocf, aes(x=Fcat, y=Rfrq)) +
    geom_bar(stat="identity", width=0.8, fill=rainbow(nrow(dfrUNPrfFocf))) +
    labs(
        x="Words With Frequency Less Than",
        y="Frequency",
        title="Frequency Of Word Count"
    ) +
    theme(
        axis.text.x=element_text(angle=60, hjust=1, vjust=1),
        axis.text.y=element_text(angle=60, hjust=1, vjust=1),
        plot.title=element_text(size=rel(1.5), colour="blue")
    )

Longitud de palabras

# Histogram of word lengths (in characters) for the words in the frequency
# table, with a vertical line marking the mean length
dfrUNPrfChrs <- data.frame(Chars=nchar(dfrUNPrfFreq$Words))
#intRowCount <- nrow(table(dfrUNPrfChrs))
ggplot(dfrUNPrfChrs, aes(x=Chars)) +
    geom_histogram(binwidth=1, fill='blue') +
    geom_vline(
        xintercept=mean(dfrUNPrfChrs$Chars),
        color='black', size=1.5, alpha=.5
    ) +
    labs(x="Word Length (Chars)", y="Number Of Words (Frequency)")

A1U1

Alejandro Gil Aguilar

30/1/2022

Análisis de conteo de palabras para conocer su frecuencia

Video de youtube de la charla

Procesamiento del lenguaje natural

Funciones

Datos

Conteo de líneas

Palabras por línea

Conteo de palabras

Mostrar palabras

Limpieza de palabras

Data frame de palabras normales

Conteo de palabras “normales”

Nube de palabras normales

Data frame de palabras realmente significativas

Conteo de palabras significativas

“cola” de palabras significativas

Eliminar palabras dispersas

Conteo final de palabras

Categorización por frecuencias

Nueva nube de palabras

Gráfica de barras de palabras

Gráfica de frecuencia

Longitud de palabras