A1U1

Jorge Andrés Figueroa Márquez

30/1/2022

Análisis de palabras de IA Talks

El presente documento tiene como objetivo analizar las charlas a través de un conteo de palabras y un análisis de frecuencia.

Para este caso se hace el análisis de la charla “IA Talk Michael Burkhardt”

Inteligencia Artificial

Paquetes

library(pacman)
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr", "xfun")

Video de youtube de la charla

# Embed the YouTube recording of the talk in the rendered document.
embed_url(url = "https://www.youtube.com/watch?v=Fzq0EQ2m6EQ")

Funciones

# Map frequencies to fixed-width bucket labels.
#
# Each value is labelled with the smallest upper bound it does not exceed
# (5, 10, 20, 50, 100, 500 or 1,000); values above 1,000 get ">1,000".
# Labels are left-padded with spaces so they all have the same width.
#
# Args:
#   value: numeric vector of frequencies.
#
# Returns:
#   Character vector with one label per element of `value` (NA stays NA).
#   The result is always character, including for empty or all-NA input,
#   and carries over the names of `value`, if any.
FreqCategory <- function(value) {
    # Inclusive upper bound of every closed bucket, in increasing order.
    vcnUpperBounds <- c(5, 10, 20, 50, 100, 500, 1000)
    # One label per bound plus the open-ended bucket at the end.
    vcsLabels <- c("      5", "     10", "     20", "     50",
                   "    100", "    500", "  1,000", ">1,000")
    # left.open = TRUE makes the intervals (lower, upper], so a value equal
    # to a bound stays in that bound's bucket (same as a `<=` comparison).
    intBucket <- findInterval(value, vcnUpperBounds, left.open = TRUE) + 1L
    strCategory <- vcsLabels[intBucket]
    names(strCategory) <- names(value)
    strCategory
}

Datos del texto

# Read the talk transcript into a character vector, one element per line.
# NOTE(review): setwd() ties this chunk to one machine's directory layout.
# NOTE(review): the file name here ("IA talk.txt") differs in letter case
# from the one embedded at the end of the document ("IA Talk.txt"); on a
# case-sensitive file system one of the two will fail - confirm the real name.
setwd(dir = "~/ea9am")
talk <- readLines(con = "IA talk.txt")
head(x = talk)

## [1] "[Music]" ""        "[Music]" ""        "[Music]" ""

Conteo de líneas

# Number of transcript lines = length of the vector returned by readLines()
intLineCount <- length(x = talk)
print(intLineCount)

## [1] 1700

Palabras por línea

# Split every transcript line on single spaces: one character vector of
# tokens per line.
lstUNPrfLines <- str_split(talk, " ")
# Tokens per line. lengths() always returns an integer vector, also for an
# empty list, where unlist(lapply(x, length)) would yield NULL.
# NOTE: a blank line splits into one empty-string token, so it counts as 1
# here (those "" tokens are visible in the word listing further down).
vciUNPrfWperL <- lengths(lstUNPrfLines)
# Print the mean number of tokens per line
mean(vciUNPrfWperL)

## [1] 3.084118

Conteo de palabras

# Flatten the per-line token lists into one vector of tokens
vcsUNPrfWords <- unlist(x = lstUNPrfLines)
# Total token count = length of the flattened vector
intWordCount <- length(x = vcsUNPrfWords)
# Print it
print(intWordCount)

## [1] 5243

Mostrar palabras

# Show the first 100 raw tokens (before any cleaning)
head(x = vcsUNPrfWords, n = 100)

##   [1] "[Music]"      ""             "[Music]"      ""             "[Music]"     
##   [6] ""             "[Music]"      ""             "artificial"   "response"    
##  [11] ""             "[Music]"      ""             "[Music]"      ""            
##  [16] "[Music]"      ""             "good"         "morning"      "everyone"    
##  [21] "i'm"          "buenos"       "dias"         "hi"           ""            
##  [26] "my"           "name"         "is"           "michael"      ""            
##  [31] "i'm"          "very"         "happy"        "to"           "be"          
##  [36] "here"         "and"          ""             "share"        ""            
##  [41] "a"            "bit"          "about"        ""             "my"          
##  [46] "journey"      "and"          ""             "how"          "it"          
##  [51] "all"          "started"      "basically"    "in"           "mexico"      
##  [56] ""             "but"          "i"            "will"         "talk"        
##  [61] "about"        "that"         "later"        ""             "and"         
##  [66] "yeah"         "i'm"          "very"         "glad"         "to"          
##  [71] "be"           "here"         ""             "um"           "i"           
##  [76] "want"         "to"           "start"        "with"         "a"           
##  [81] "little"       ""             "introduction" "into"         "myself"      
##  [86] ""             "which"        "you"          "know"         "gives"       
##  [91] "you"          "a"            "better"       "idea"         ""            
##  [96] "of"           "from"         ""             "from"         "what"

Limpieza de palabras

# Normalise the raw tokens: lower-case them, strip digits, punctuation,
# whitespace and leftover symbols, then drop tokens that ended up empty.

# lower case
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# remove numbers
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:digit:]]", replacement = "")
# remove punctuation
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:punct:]]", replacement = "")
# remove white spaces
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:space:]]", replacement = "")
# remove special chars
# Every character is escaped or placed so that it is a literal. Written
# unescaped as "&-_", the hyphen makes a range from "&" to "_" that also
# covers digits, upper-case letters and other punctuation. "+" and "^" are
# listed explicitly because that accidental range used to remove them too.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[~@#\\$%\\&_=<>+\\^\\-]", replacement = "")
# remove literal dollar signs
# fixed() is required: as a regular expression, a bare "$" is the
# end-of-string anchor and matches no character at all.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = fixed("$"), replacement = "")
# remove empty strings (last, so tokens emptied by any step above are dropped)
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# head
head(vcsUNPrfWords, 100)

##   [1] "music"          "music"          "music"          "music"         
##   [5] "artificial"     "response"       "music"          "music"         
##   [9] "music"          "good"           "morning"        "everyone"      
##  [13] "im"             "buenos"         "dias"           "hi"            
##  [17] "my"             "name"           "is"             "michael"       
##  [21] "im"             "very"           "happy"          "to"            
##  [25] "be"             "here"           "and"            "share"         
##  [29] "a"              "bit"            "about"          "my"            
##  [33] "journey"        "and"            "how"            "it"            
##  [37] "all"            "started"        "basically"      "in"            
##  [41] "mexico"         "but"            "i"              "will"          
##  [45] "talk"           "about"          "that"           "later"         
##  [49] "and"            "yeah"           "im"             "very"          
##  [53] "glad"           "to"             "be"             "here"          
##  [57] "um"             "i"              "want"           "to"            
##  [61] "start"          "with"           "a"              "little"        
##  [65] "introduction"   "into"           "myself"         "which"         
##  [69] "you"            "know"           "gives"          "you"           
##  [73] "a"              "better"         "idea"           "of"            
##  [77] "from"           "from"           "what"           "perspective"   
##  [81] "and"            "background"     "im"             "coming"        
##  [85] "so"             "i"              "did"            "a"             
##  [89] "bachelor"       "in"             "a"              "business"      
##  [93] "administration" "in"             "germany"        "im"            
##  [97] "also"           "from"           "germany"        "originally"

Data frame de palabras normales

# Wrap the cleaned tokens in a one-column data frame; the column is named
# "Words" and holds character values (not factors).
dfrUNPrfWords <- data.frame(
    Words = as.character(vcsUNPrfWords),
    stringsAsFactors = FALSE
)
# First rows of the word table
head(dfrUNPrfWords, n = 10)

##         Words
## 1       music
## 2       music
## 3       music
## 4       music
## 5  artificial
## 6    response
## 7       music
## 8       music
## 9       music
## 10       good

Conteo de palabras “normales”

# Summarise the data: one row per distinct word with its number of
# occurrences (Freq), most frequent words first.
dfrUNPrfFreq <- arrange(
    summarise(group_by(dfrUNPrfWords, Words), Freq = n()),
    desc(Freq)
)
head(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words  Freq
##   <chr> <int>
## 1 and     257
## 2 the     162
## 3 you     156
## 4 to      117
## 5 i       108
## 6 um      105

Nube de palabras normales

# Word cloud of the first 100 rows of the frequency table (sorted by
# descending frequency above), with the most frequent words placed first.
wordcloud(
    words = dfrUNPrfFreq$Words[1:100],
    freq = dfrUNPrfFreq$Freq[1:100],
    max.words = 100,
    random.order = FALSE,
    colors = brewer.pal(8, "Dark2")
)

Data frame de palabras realmente significativas

En esta sección se quitan las “stop words”

# Significant words only.
# Step 1: drop every word of two characters or fewer.
dfrUNPrfWords <- dfrUNPrfWords %>%
    filter(str_length(Words) > 2)

# Step 2: drop the "stop words", i.e. common words such as conjunctions.
# ("to" has only two characters, so step 1 has already removed it.)
vcsCmnWords <- c(
    "and", "you", "music", "that", "to", "for", "very", "happy", "all",
    "but", "the", "its", "was", "are", "then", "yeah", "how", "can", "not",
    "has", "your", "one"
)
dfrUNPrfWords <- dfrUNPrfWords %>%
    filter(!(Words %in% vcsCmnWords))

# Step 3: drop words that carry no meaning in this particular context.
vcsBadWords <- c(
    "did", "from", "with", "morning", "buenos", "dias", "name", "what",
    "about", "this", "right"
)
dfrUNPrfWords <- dfrUNPrfWords %>%
    filter(!(Words %in% vcsBadWords))

# Show the first remaining words
head(dfrUNPrfWords)

##        Words
## 1 artificial
## 2   response
## 3       good
## 4   everyone
## 5    michael
## 6       here

Conteo de palabras significativas

# Recompute the frequency table on the filtered words: one row per distinct
# word, sorted by descending number of occurrences.
dfrUNPrfFreq <- group_by(dfrUNPrfWords, Words) %>%
    summarise(Freq = n()) %>%
    arrange(desc(Freq))
head(dfrUNPrfFreq)

## # A tibble: 6 x 2
##   Words   Freq
##   <chr>  <int>
## 1 have      34
## 2 know      34
## 3 more      34
## 4 just      29
## 5 people    29
## 6 thats     29

“Cola” de palabras significativas

# Last rows of the sorted table, i.e. the least frequent words
tail(dfrUNPrfFreq, n = 6L)

## # A tibble: 6 x 2
##   Words    Freq
##   <chr>   <int>
## 1 within      1
## 2 without     1
## 3 wouldnt     1
## 4 writing     1
## 5 yes         1
## 6 zone        1

Eliminar palabras dispersas

# Keep only words that occur more than 5 times (a word with a frequency of
# exactly 5 is dropped as well).
dfrUNPrfFreq <- dfrUNPrfFreq %>%
    filter(Freq > 5)
tail(dfrUNPrfFreq, n = 6L)

## # A tibble: 6 x 2
##   Words     Freq
##   <chr>    <int>
## 1 super        6
## 2 talk         6
## 3 trust        6
## 4 wanted       6
## 5 world        6
## 6 yourself     6

Conteo final de palabras

# Number of distinct words left after filtering: the frequency table has one
# row per word, so this is the length of its Words column.
intWordCountFinal <- length(dfrUNPrfFreq[["Words"]])
# print
print(intWordCountFinal)

## [1] 99

Categorización por frecuencias

# Add the frequency-category column: each word gets the bucket label that
# FreqCategory() assigns to its frequency.
dfrUNPrfFreq <- dfrUNPrfFreq %>%
    mutate(Fcat = FreqCategory(Freq))
# New data frame with the frequency of the categorised frequencies: how many
# words fall into each bucket.
dfrUNPrfFocf <- summarise(group_by(dfrUNPrfFreq, Fcat), Rfrq = n())
# Make the bucket label an ordered factor; the level order is the row order
# of the summary table.
dfrUNPrfFocf$Fcat <- factor(
    dfrUNPrfFocf$Fcat,
    levels = dfrUNPrfFocf$Fcat,
    ordered = TRUE
)
# head
head(dfrUNPrfFocf, n = 10)

## # A tibble: 3 x 2
##   Fcat       Rfrq
##   <ord>     <int>
## 1 "     10"    67
## 2 "     20"    22
## 3 "     50"    10

Nueva nube de palabras

# Word cloud of the first 50 rows of the filtered frequency table, with the
# most frequent words placed first.
wordcloud(
    words = dfrUNPrfFreq$Words[1:50],
    freq = dfrUNPrfFreq$Freq[1:50],
    max.words = 100,
    random.order = FALSE,
    colors = brewer.pal(8, "Dark2")
)

Gráfica de barras de palabras

# Bar chart of the first 30 rows of the frequency table, with flipped
# coordinates so the bars run horizontally.
# NOTE: rainbow(30) supplies exactly 30 fill colours, so it relies on the
# slice really containing 30 rows.
ggplot(
    data = slice(dfrUNPrfFreq, 1:30),
    mapping = aes(x = reorder(Words, -Freq), y = Freq)
) +
    geom_bar(stat = "identity", fill = rainbow(30)) +
    coord_flip() +
    xlab("Words") +
    ylab("Frequency") +
    ggtitle("Primeras 30 palabras con mayor frecuencia") +
    theme(plot.title = element_text(size = rel(1.5), colour = "blue"))

Gráfica de frecuencia

# Bar chart of the number of words per frequency bucket, one colour per
# bucket and both axis texts rotated by 60 degrees.
ggplot(data = dfrUNPrfFocf, mapping = aes(x = Fcat, y = Rfrq)) +
    geom_bar(
        stat = "identity",
        width = 0.8,
        fill = rainbow(length(dfrUNPrfFocf$Fcat))
    ) +
    ggtitle("Frequency Of Word Count") +
    xlab("Words With Frequency Less Than") +
    ylab("Frequency") +
    theme(
        axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1),
        axis.text.y = element_text(angle = 60, hjust = 1, vjust = 1),
        plot.title = element_text(size = rel(1.5), colour = "blue")
    )

Longitud de palabras

# Character length of every word kept in the frequency table
dfrUNPrfChrs <- data.frame(Chars = nchar(dfrUNPrfFreq$Words))
#intRowCount <- nrow(table(dfrUNPrfChrs))
# Histogram of word lengths (one bin per length) with a vertical line at the
# mean word length.
ggplot(data = dfrUNPrfChrs, mapping = aes(x = Chars)) +
    geom_histogram(binwidth = 1, fill = "blue") +
    geom_vline(
        xintercept = mean(dfrUNPrfChrs$Chars),
        color = "black",
        size = 1.5,
        alpha = 0.5
    ) +
    xlab("Word Length (Chars)") +
    ylab("Number Of Words (Frequency)")

Descargas

Código

# Embed the R Markdown source of this report as a download link.
xfun::embed_file(path = "A1U1.Rmd")

Download A1U1.Rmd

Datos

# Embed the transcript text file as a download link.
# NOTE(review): the transcript is read earlier as "IA talk.txt" (lower-case
# "t"); confirm which spelling matches the file on disk.
xfun::embed_file(path = "IA Talk.txt")

Download IA Talk.txt