Análisis de palabras de IA Talks
- El presente documento tiene como objetivo analizar las charlas mediante un conteo de palabras y un análisis de frecuencia.
Para este caso se hace el análisis de la charla “IA Talk Michael Burkhardt”
Inteligencia Artificial
Paquetes
library(pacman)
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr", "xfun")
Video de youtube de la charla
# Embed the YouTube recording of the talk by URL.
# NOTE(review): embed_url() is presumably the vembedr helper (vembedr is
# among the packages passed to p_load) -- confirm.
embed_url("https://www.youtube.com/watch?v=Fzq0EQ2m6EQ")
Funciones
#' Bucket word frequencies into upper-bound categories
#'
#' Maps each frequency to the label of the first threshold it does not
#' exceed (5, 10, 20, 50, 100, 500, 1000); anything above 1000 is labelled
#' ">1,000". Labels are kept byte-for-byte as they were, because other code
#' groups and plots on them.
#'
#' @param value Numeric vector of frequencies.
#' @return Character vector, same length as `value`. `NA` inputs give
#'   `NA_character_`; a zero-length input gives `character(0)`.
FreqCategory <- function(value) {
  # Inclusive upper bounds of the buckets, ascending.
  vcnUpperBounds <- c(5, 10, 20, 50, 100, 500, 1000)
  # One label per bound plus the catch-all for values above the last bound.
  vcsLabels <- c(" 5", " 10", " 20", " 50", " 100", " 500", " 1,000", ">1,000")
  # left.open = TRUE makes the intervals (lower, upper], i.e. `value <= bound`;
  # the result is 0 for the first bucket, hence the + 1L when indexing.
  intBucket <- findInterval(value, vcnUpperBounds, left.open = TRUE)
  strCategory <- vcsLabels[intBucket + 1L]
  strCategory
}
Datos del texto
# Read the talk transcript; readLines() returns one element per line.
# The file is addressed by its full path instead of calling setwd(), so the
# session's working directory is left untouched.
talk <- readLines(file.path("~/ea9am", "IA talk.txt"))
# Preview the first lines of the transcript.
head(talk)
## [1] "[Music]" "" "[Music]" "" "[Music]" ""
Conteo de líneas
# Number of transcript lines = number of elements in the vector.
intLineCount <- NROW(talk)
print(intLineCount)
## [1] 1700
Palabras por línea
# Split every transcript line on single spaces: one character vector of
# tokens per line.
lstUNPrfLines <- str_split(talk, " ")
# Tokens per line. lengths() always returns an integer vector (also for an
# empty list), unlike unlist(lapply(x, length)).
# Note: an empty line splits into a single "" token, so it counts as 1 here.
vciUNPrfWperL <- lengths(lstUNPrfLines)
# Mean number of tokens per line.
mean(vciUNPrfWperL)
## [1] 3.084118
Conteo de palabras
# Flatten the per-line token lists into one character vector of tokens.
vcsUNPrfWords <- unlist(lstUNPrfLines)
# The total token count is simply the length of that vector.
intWordCount <- length(vcsUNPrfWords)
# Show the count.
print(intWordCount)
## [1] 5243
Mostrar palabras
# Preview the first 100 raw tokens (still uncleaned at this point).
head(vcsUNPrfWords, n = 100)
## [1] "[Music]" "" "[Music]" "" "[Music]"
## [6] "" "[Music]" "" "artificial" "response"
## [11] "" "[Music]" "" "[Music]" ""
## [16] "[Music]" "" "good" "morning" "everyone"
## [21] "i'm" "buenos" "dias" "hi" ""
## [26] "my" "name" "is" "michael" ""
## [31] "i'm" "very" "happy" "to" "be"
## [36] "here" "and" "" "share" ""
## [41] "a" "bit" "about" "" "my"
## [46] "journey" "and" "" "how" "it"
## [51] "all" "started" "basically" "in" "mexico"
## [56] "" "but" "i" "will" "talk"
## [61] "about" "that" "later" "" "and"
## [66] "yeah" "i'm" "very" "glad" "to"
## [71] "be" "here" "" "um" "i"
## [76] "want" "to" "start" "with" "a"
## [81] "little" "" "introduction" "into" "myself"
## [86] "" "which" "you" "know" "gives"
## [91] "you" "a" "better" "idea" ""
## [96] "of" "from" "" "from" "what"
Limpieza de palabras
# Normalise the raw tokens one step at a time.
# Lower-case so that counting is case-insensitive.
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# Remove digits.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:digit:]]", "")
# Remove punctuation. With stringr's ICU engine [[:punct:]] covers the
# Unicode punctuation category only, so symbols need their own step below.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:punct:]]", "")
# Remove any whitespace left inside a token.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:space:]]", "")
# Remove symbols ($, +, <, =, >, ^, ~, ...). \p{S} is the Unicode symbol
# category; it replaces a hand-written character class whose "&-_" was
# parsed as a range, and a separate "$" pattern that only matched the
# end-of-string anchor and therefore removed nothing.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "\\p{S}", "")
# Drop tokens that are empty after cleaning. This runs last so that a token
# emptied by any of the steps above is removed as well.
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# Preview the first 100 cleaned tokens.
head(vcsUNPrfWords, 100)
## [1] "music" "music" "music" "music"
## [5] "artificial" "response" "music" "music"
## [9] "music" "good" "morning" "everyone"
## [13] "im" "buenos" "dias" "hi"
## [17] "my" "name" "is" "michael"
## [21] "im" "very" "happy" "to"
## [25] "be" "here" "and" "share"
## [29] "a" "bit" "about" "my"
## [33] "journey" "and" "how" "it"
## [37] "all" "started" "basically" "in"
## [41] "mexico" "but" "i" "will"
## [45] "talk" "about" "that" "later"
## [49] "and" "yeah" "im" "very"
## [53] "glad" "to" "be" "here"
## [57] "um" "i" "want" "to"
## [61] "start" "with" "a" "little"
## [65] "introduction" "into" "myself" "which"
## [69] "you" "know" "gives" "you"
## [73] "a" "better" "idea" "of"
## [77] "from" "from" "what" "perspective"
## [81] "and" "background" "im" "coming"
## [85] "so" "i" "did" "a"
## [89] "bachelor" "in" "a" "business"
## [93] "administration" "in" "germany" "im"
## [97] "also" "from" "germany" "originally"
Data frame de palabras normales
# One-column data frame of the cleaned tokens; the column is called "Words"
# and is stored as character (never as factor).
dfrUNPrfWords <- data.frame(
  Words = as.character(vcsUNPrfWords),
  stringsAsFactors = FALSE
)
# Show the first ten tokens.
head(dfrUNPrfWords, n = 10)
## Words
## 1 music
## 2 music
## 3 music
## 4 music
## 5 artificial
## 6 response
## 7 music
## 8 music
## 9 music
## 10 good
Conteo de palabras “normales”
# Build the frequency table: one row per distinct word with its number of
# occurrences (Freq), most frequent first.
dfrUNPrfFreq <- group_by(dfrUNPrfWords, Words)
dfrUNPrfFreq <- summarise(dfrUNPrfFreq, Freq = n())
dfrUNPrfFreq <- arrange(dfrUNPrfFreq, desc(Freq))
# Show the most frequent words.
head(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 and 257
## 2 the 162
## 3 you 156
## 4 to 117
## 5 i 108
## 6 um 105
Nube de palabras normales
# Word cloud of the (up to) 100 most frequent words. dfrUNPrfFreq is sorted
# by descending frequency, so head() picks the top entries; unlike [1:100]
# it does not pad with NA when fewer than 100 words exist.
wordcloud(
  head(dfrUNPrfFreq$Words, 100),
  head(dfrUNPrfFreq$Freq, 100),
  random.order = FALSE,  # place the most frequent words first (centre)
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
Data frame de palabras realmente significantes
En esta sección se quitan las “stop words”
# Keep significant words only.
# Common "stop words" (conjunctions, pronouns, fillers, ...).
vcsCmnWords <- c(
  "and", "you", "music", "that", "to", "for", "very", "happy", "all", "but",
  "the", "its", "was", "are", "then", "yeah", "how", "can", "not", "has",
  "your", "one"
)
# Words that carry no meaning in the context of this particular talk.
vcsBadWords <- c(
  "did", "from", "with", "morning", "buenos", "dias", "name", "what",
  "about", "this", "right"
)
# A word survives when it is longer than two characters and appears in
# neither of the two exclusion lists.
dfrUNPrfWords <- filter(
  dfrUNPrfWords,
  str_length(Words) > 2,
  !(Words %in% vcsCmnWords),
  !(Words %in% vcsBadWords)
)
# Show the first remaining words.
head(dfrUNPrfWords)
## Words
## 1 artificial
## 2 response
## 3 good
## 4 everyone
## 5 michael
## 6 here
Conteo de palabras significativas
# Recompute the frequency table on the filtered words: one row per distinct
# word with its count (Freq), sorted from most to least frequent.
dfrUNPrfFreq <- group_by(dfrUNPrfWords, Words)
dfrUNPrfFreq <- summarise(dfrUNPrfFreq, Freq = n())
dfrUNPrfFreq <- arrange(dfrUNPrfFreq, desc(Freq))
# Show the most frequent significant words.
head(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 have 34
## 2 know 34
## 3 more 34
## 4 just 29
## 5 people 29
## 6 thats 29
“Cola” de palabras significativas
# Show the last rows of the frequency table, i.e. the least frequent words.
tail(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 within 1
## 2 without 1
## 3 wouldnt 1
## 4 writing 1
## 5 yes 1
## 6 zone 1
Eliminar palabras dispersas
# Drop sparse words: only words occurring more than 5 times are kept
# (a word with exactly 5 occurrences is removed too).
dfrUNPrfFreq <- dfrUNPrfFreq %>%
  filter(Freq > 5)
# Show the least frequent words that remain.
tail(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 super 6
## 2 talk 6
## 3 trust 6
## 4 wanted 6
## 5 world 6
## 6 yourself 6
Conteo final de palabras
# Number of distinct words left = number of rows in the frequency table.
intWordCountFinal <- nrow(dfrUNPrfFreq)
# Show the count.
print(intWordCountFinal)
## [1] 99
Categorización por frecuencias
# Add the frequency-category column: each word gets the bucket label that
# FreqCategory() assigns to its frequency.
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat = FreqCategory(Freq))
# Frequency of categorised frequencies: how many words fall in each bucket.
dfrUNPrfFocf <- dfrUNPrfFreq %>%
  group_by(Fcat) %>%
  summarise(Rfrq = n())
# Turn the labels into an ordered factor whose levels follow the numeric
# thresholds. Taking the levels from the grouped row order would sort them
# as text (" 100" before " 20"). The label vector must mirror the labels
# returned by FreqCategory(); intersect() keeps only buckets that occur.
dfrUNPrfFocf$Fcat <- factor(
  dfrUNPrfFocf$Fcat,
  levels = intersect(
    c(" 5", " 10", " 20", " 50", " 100", " 500", " 1,000", ">1,000"),
    dfrUNPrfFocf$Fcat
  ),
  ordered = TRUE
)
# Put the rows in the same (threshold) order as the factor levels.
dfrUNPrfFocf <- arrange(dfrUNPrfFocf, Fcat)
# Show the category table.
head(dfrUNPrfFocf, 10)
## # A tibble: 3 x 2
## Fcat Rfrq
## <ord> <int>
## 1 " 10" 67
## 2 " 20" 22
## 3 " 50" 10
Nueva nube de palabras
# Word cloud of the (up to) 50 most frequent significant words.
# dfrUNPrfFreq is sorted by descending frequency, so head() picks the top
# entries; unlike [1:50] it does not pad with NA when fewer words exist.
wordcloud(
  head(dfrUNPrfFreq$Words, 50),
  head(dfrUNPrfFreq$Freq, 50),
  random.order = FALSE,  # place the most frequent words first (centre)
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
Gráfica de barras de palabras
# Horizontal bar chart of the (up to) 30 most frequent words.
# The number of fill colours is derived from the rows actually plotted, so
# the chart still renders when fewer than 30 words are available.
ggplot(
  head(dfrUNPrfFreq, 30),
  aes(x = reorder(Words, -Freq), y = Freq)
) +
  geom_bar(
    stat = "identity",
    fill = rainbow(min(30, nrow(dfrUNPrfFreq)))
  ) +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Primeras 30 palabras con mayor frecuencia") +
  theme(plot.title = element_text(size = rel(1.5), colour = "blue")) +
  # Flip the axes so the words are readable on the vertical axis.
  coord_flip()
Gráfica de frecuencia
# Bar chart: number of words (Rfrq) per frequency category (Fcat), one
# rainbow colour per category row.
ggplot(dfrUNPrfFocf, aes(x = Fcat, y = Rfrq)) +
  geom_col(width = 0.8, fill = rainbow(nrow(dfrUNPrfFocf))) +
  labs(
    x = "Words With Frequency Less Than",
    y = "Frequency",
    title = "Frequency Of Word Count"
  ) +
  # Tilt the tick labels on both axes and style the title.
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1),
    axis.text.y = element_text(angle = 60, hjust = 1, vjust = 1),
    plot.title = element_text(size = rel(1.5), colour = "blue")
  )
Longitud de palabras
# Length in characters of every word left in the frequency table.
dfrUNPrfChrs <- data.frame(Chars = nchar(dfrUNPrfFreq$Words))
# Histogram of word lengths (one bin per length) with a semi-transparent
# vertical line at the mean length.
ggplot(dfrUNPrfChrs, aes(x = Chars)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  geom_vline(
    xintercept = mean(dfrUNPrfChrs$Chars),
    color = "black",
    size = 1.5,
    alpha = .5
  ) +
  labs(x = "Word Length (Chars)", y = "Number Of Words (Frequency)")