# Load pacman, then use its p_load() to bring in the packages this analysis uses
library(pacman)
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr", "xfun")
# Embed a YouTube video in the document
# NOTE(review): presumably the recording of the talk analysed below - confirm
embed_url("https://youtu.be/Fzq0EQ2m6EQ")
Este es un ejercicio de la materia de probabilidad y estadística de la clase de las 9:00 a.m. (LMV) del departamento de matemáticas de ITSON.
El presente documento tiene como objetivo analizar las charlas mediante un conteo de palabras y un análisis de frecuencia.
En este caso se analiza la charla “Redefiniendo retos en oportunidades con IA”, impartida por Michael Burkhardt el 20 de enero de 2022.
Entendiendo el lenguaje
#' Bucket frequencies into labelled upper-bound categories.
#'
#' @param value Vector of frequencies to classify.
#' @return A vector the same length as `value`. Each element is the label of
#'   the smallest threshold (5, 10, 20, 50, 100, 500, 1000) that the value
#'   does not exceed, or ">1,000" when it is above every threshold.
#'   `NA` inputs yield `NA`.
FreqCategory <- function(value) {
  # Thresholds run from widest to narrowest so that every pass can overwrite
  # the label picked by the previous, wider bucket.
  vcnLimits <- c(1000, 500, 100, 50, 20, 10, 5)
  vcsLabels <- c(" 1,000", " 500", " 100", " 50", " 20", " 10", " 5")
  strCategory <- ">1,000"
  for (i in seq_along(vcnLimits)) {
    strCategory <- ifelse(value <= vcnLimits[i], vcsLabels[i], strCategory)
  }
  strCategory
}
# Read the talk transcript: one vector element per line of the text file.
# The path is spelled out instead of calling setwd(), so loading the data does
# not change the session's working directory as a side effect.
strTranscriptPath <- path.expand(file.path("~", "ea9am", "iatalkmichael.txt"))
iatalkmichael <- readLines(strTranscriptPath)
# Preview the first lines of the transcript
head(iatalkmichael)
## [1] "[Music]" "" "[Music]" "" "[Music]" ""
# Number of lines read from the transcript
intLineCount <- length(iatalkmichael)
intLineCount
## [1] 1700
# Split every line on single spaces; the result is a list holding one
# character vector of tokens per line
lstUNPrfLines <- iatalkmichael %>% str_split(" ")
# Tokens per line.
# NOTE(review): a blank line yields a single "" token, so blank lines count
# as one "word" in this figure and in the total below - confirm this is wanted.
vciUNPrfWperL <- lstUNPrfLines %>%
  lapply(length) %>%
  unlist()
# Average number of tokens per line
mean(vciUNPrfWperL)
## [1] 3.084118
# Flatten the per-line list into one vector of tokens
vcsUNPrfWords <- unlist(lstUNPrfLines)
# Total number of tokens (empty tokens are still included at this stage)
intWordCount <- length(vcsUNPrfWords)
intWordCount
## [1] 5243
head(vcsUNPrfWords, n = 100)
## [1] "[Music]" "" "[Music]" "" "[Music]"
## [6] "" "[Music]" "" "artificial" "response"
## [11] "" "[Music]" "" "[Music]" ""
## [16] "[Music]" "" "good" "morning" "everyone"
## [21] "i'm" "buenos" "dias" "hi" ""
## [26] "my" "name" "is" "michael" ""
## [31] "i'm" "very" "happy" "to" "be"
## [36] "here" "and" "" "share" ""
## [41] "a" "bit" "about" "" "my"
## [46] "journey" "and" "" "how" "it"
## [51] "all" "started" "basically" "in" "mexico"
## [56] "" "but" "i" "will" "talk"
## [61] "about" "that" "later" "" "and"
## [66] "yeah" "i'm" "very" "glad" "to"
## [71] "be" "here" "" "um" "i"
## [76] "want" "to" "start" "with" "a"
## [81] "little" "" "introduction" "into" "myself"
## [86] "" "which" "you" "know" "gives"
## [91] "you" "a" "better" "idea" ""
## [96] "of" "from" "" "from" "what"
# Normalise the tokens before counting them.
# Lower-case everything so that "The" and "the" are the same word
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# Remove digits
vcsUNPrfWords <- str_replace_all(
  vcsUNPrfWords, pattern = "[[:digit:]]", replacement = ""
)
# Remove punctuation
vcsUNPrfWords <- str_replace_all(
  vcsUNPrfWords, pattern = "[[:punct:]]", replacement = ""
)
# Remove whitespace
vcsUNPrfWords <- str_replace_all(
  vcsUNPrfWords, pattern = "[[:space:]]", replacement = ""
)
# Remove leftover special characters.
# "&" and "-" are escaped so they are literals: the previous class contained
# "&-_", which is a character *range* from "&" to "_" rather than three
# characters. "+" and "^" are listed explicitly because, after the steps
# above, that accidental range was what removed them; the set of characters
# stripped here is therefore meant to stay the same as before.
vcsUNPrfWords <- str_replace_all(
  vcsUNPrfWords, pattern = "[~@#$%\\&_=<>+^\\-]", replacement = ""
)
# Remove literal dollar signs. fixed() is required: as a regular expression
# "$" is the end-of-string anchor, so the unescaped pattern removed nothing.
vcsUNPrfWords <- str_replace_all(
  vcsUNPrfWords, pattern = fixed("$"), replacement = ""
)
# Drop tokens that ended up empty. This runs last so that no cleaning step
# can reintroduce empty strings after the filter.
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# Preview the cleaned tokens
head(vcsUNPrfWords, 100)
## [1] "music" "music" "music" "music"
## [5] "artificial" "response" "music" "music"
## [9] "music" "good" "morning" "everyone"
## [13] "im" "buenos" "dias" "hi"
## [17] "my" "name" "is" "michael"
## [21] "im" "very" "happy" "to"
## [25] "be" "here" "and" "share"
## [29] "a" "bit" "about" "my"
## [33] "journey" "and" "how" "it"
## [37] "all" "started" "basically" "in"
## [41] "mexico" "but" "i" "will"
## [45] "talk" "about" "that" "later"
## [49] "and" "yeah" "im" "very"
## [53] "glad" "to" "be" "here"
## [57] "um" "i" "want" "to"
## [61] "start" "with" "a" "little"
## [65] "introduction" "into" "myself" "which"
## [69] "you" "know" "gives" "you"
## [73] "a" "better" "idea" "of"
## [77] "from" "from" "what" "perspective"
## [81] "and" "background" "im" "coming"
## [85] "so" "i" "did" "a"
## [89] "bachelor" "in" "a" "business"
## [93] "administration" "in" "germany" "im"
## [97] "also" "from" "germany" "originally"
# Wrap the cleaned tokens in a one-column data frame: "Words", stored as
# character strings
dfrUNPrfWords <- data.frame(
  Words = as.character(vcsUNPrfWords),
  stringsAsFactors = FALSE
)
# First rows, before any word filtering
head(dfrUNPrfWords, n = 10)
## Words
## 1 music
## 2 music
## 3 music
## 4 music
## 5 artificial
## 6 response
## 7 music
## 8 music
## 9 music
## 10 good
# Summarise the data: occurrences of each distinct word, most frequent first
dfrUNPrfFreq <- arrange(
  summarise(group_by(dfrUNPrfWords, Words), Freq = n()),
  desc(Freq)
)
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 and 257
## 2 the 162
## 3 you 156
## 4 to 117
## 5 i 108
## 6 um 105
# Word cloud of the (up to) 100 most frequent words.
# head() replaces [1:100] so that a transcript with fewer than 100 distinct
# words does not pad the word and frequency vectors with NA.
wordcloud(
  head(dfrUNPrfFreq$Words, 100),
  head(dfrUNPrfFreq$Freq, 100),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
# Keep significant words only.
# Stop words: common Spanish and English words (conjunctions, fillers, ...)
vcsCmnWords <- c(
  "de", "que", "en", "y", "la", "a", "el", "es", "una", "un", "pues", "no",
  "para", "los", "se", "las", "como", "con", "más", "por", "lo", "hay", "del",
  "o", "entonces", "este", "está", "nos", "pero", "también", "creo", "porque",
  "también", "yo", "ya", "esta", "si", "me", "al", "son", "tiene", "donde",
  "bueno", "ha", "sobre", "ejemplo", "bien", "gracias", "ser", "eso", "todo",
  "uso", "ver", "tener", "esto", "estos", "muchas", "cómo", "cuando", "sea",
  "tenemos", "su", "tienen", "así", "desde", "han", "parte", "ahí", "les",
  "tal", "qué", "estar",
  "the", "and", "um", "you", "that", "what", "music", "maybe", "they", "not",
  "have", "can", "so", "because", "was", "to", "so", "my", "it", "know", "of",
  "a", "for", "more", "where", "things", "very", "will", "but", "thats",
  "which", "with", "there", "also", "yeah", "from", "really", "its", "about",
  "are", "right", "then", "when", "all", "theres", "this", "just", "how",
  "many", "say", "something", "has", "now", "your", "always", "basically",
  "bit", "who", "first", "put", "little", "like", "would", "happening", "much",
  "end", "dont", "did", "into", "who", "lets", "had", "okay", "one", "those",
  "way", "want", "might", "going", "certain", "see", "even", "were", "trust",
  "here"
)
# Words that carry no meaning in this particular context
vcsBadWords <- c("decir", "muy", "están")
# Apply the three filters in sequence: drop words of length <= 2, then the
# stop words, then the context-specific words
dfrUNPrfWords <- dfrUNPrfWords %>%
  filter(str_length(Words) > 2) %>%
  filter(!(Words %in% vcsCmnWords)) %>%
  filter(!(Words %in% vcsBadWords))
# Show the first remaining words
head(dfrUNPrfWords)
## Words
## 1 artificial
## 2 response
## 3 good
## 4 morning
## 5 everyone
## 6 buenos
# Recompute the frequency table on the filtered words, most frequent first
dfrUNPrfFreq <- arrange(
  summarise(group_by(dfrUNPrfWords, Words), Freq = n()),
  desc(Freq)
)
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 people 29
## 2 projects 16
## 3 doing 13
## 4 project 13
## 5 think 13
## 6 been 12
# Least frequent words before the cutoff is applied
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 within 1
## 2 without 1
## 3 wouldnt 1
## 4 writing 1
## 5 yes 1
## 6 zone 1
# Keep only the words that occur more than 5 times (Freq > 5)
dfrUNPrfFreq <- dfrUNPrfFreq %>%
  filter(Freq > 5)
# Least frequent words that survive the cutoff
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 started 6
## 2 super 6
## 3 talk 6
## 4 wanted 6
## 5 world 6
## 6 yourself 6
# Number of distinct words left after the cutoff: one row per word
intWordCountFinal <- nrow(dfrUNPrfFreq)
# Show the count
intWordCountFinal
## [1] 50
# Add the frequency-category column. The column is referenced by name inside
# mutate() rather than through dfrUNPrfFreq$Freq.
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat = FreqCategory(Freq))
# New data frame: how many words fall into each frequency category
dfrUNPrfFocf <- dfrUNPrfFreq %>%
  group_by(Fcat) %>%
  summarise(Rfrq = n())
# Order the categories by the frequencies they cover, not alphabetically.
# group_by() sorts the labels as strings, which would for instance place
# " 100" before " 20". Taking the labels in order of increasing Freq gives
# the intended order whatever the label text is.
vcsFcatLevels <- unique(dfrUNPrfFreq$Fcat[order(dfrUNPrfFreq$Freq)])
dfrUNPrfFocf$Fcat <- factor(
  dfrUNPrfFocf$Fcat,
  levels = vcsFcatLevels,
  ordered = TRUE
)
# Keep the rows in the same order as the factor levels
dfrUNPrfFocf <- arrange(dfrUNPrfFocf, Fcat)
# Show the categorised counts
head(dfrUNPrfFocf, 10)
## # A tibble: 3 x 2
## Fcat Rfrq
## <ord> <int>
## 1 " 10" 42
## 2 " 20" 7
## 3 " 50" 1
# Word cloud of the words that passed the frequency cutoff (at most 50).
# head() replaces [1:50] so that fewer than 50 remaining words do not pad the
# word and frequency vectors with NA.
wordcloud(
  head(dfrUNPrfFreq$Words, 50),
  head(dfrUNPrfFreq$Freq, 50),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
# Horizontal bar chart of the most frequent words (up to 30).
# The number of bars is derived from the data so that the fill vector always
# has one colour per row; a fixed rainbow(30) fails when fewer than 30 words
# remain after filtering.
intTopN <- min(30, nrow(dfrUNPrfFreq))
ggplot(
  slice(dfrUNPrfFreq, seq_len(intTopN)),
  aes(x = reorder(Words, -Freq), y = Freq)
) +
  geom_bar(stat = "identity", fill = rainbow(intTopN)) +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Primeras 30 palabras con mayor frecuencia") +
  theme(plot.title = element_text(size = rel(1.5), colour = "blue")) +
  coord_flip()
# Bar chart: number of words in each frequency category, one colour per bar
ggplot(dfrUNPrfFocf, aes(x = Fcat, y = Rfrq)) +
  geom_bar(
    stat = "identity",
    width = 0.8,
    fill = rainbow(length(dfrUNPrfFocf$Fcat))
  ) +
  xlab("Words With Frequency Less Than") +
  ylab("Frequency") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1),
    axis.text.y = element_text(angle = 60, hjust = 1, vjust = 1),
    plot.title = element_text(size = rel(1.5), colour = "blue")
  ) +
  ggtitle("Frequency Of Word Count")
# Length in characters of every retained word
dfrUNPrfChrs <- data.frame(Chars = nchar(dfrUNPrfFreq$Words))
#intRowCount <- nrow(table(dfrUNPrfChrs))
# Histogram of word lengths; the vertical line marks the mean word length
ggplot(dfrUNPrfChrs, aes(x = Chars)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  geom_vline(
    xintercept = mean(dfrUNPrfChrs$Chars),
    color = "black",
    size = 1.5,
    alpha = .5
  ) +
  xlab("Word Length (Chars)") +
  ylab("Number Of Words (Frequency)")