A1U1

Daniel Acedo

31/1/2022

Paquetes

library(pacman)
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr", "xfun")

Video de youtube de la charla

embed_url("https://www.youtube.com/watch?v=gGd5_DKqcCU")

Funciones

FreqCategory <- function(value) {
    strCategory <- ifelse(value <=5,   "      5",
                ifelse(value <=10,     "     10",
                ifelse(value <=20,     "     20",
                ifelse(value <=50,     "     50",
                ifelse(value <=100,    "    100",
                ifelse(value <=500,    "    500",
                ifelse(value <=1000,   "  1,000",
                              ">1,000")))))))
                strCategory
}

Datos del texto

setwd("~/ea9am")
video <- readLines("video.txt")
head(video)
## [1] "[Música]" ""          "[Música]" ""          "[Música]" ""

Conteo de lineas

# Longitud de vector 
intLineCount <- length(video)
intLineCount
## [1] 2606

Palabras por linea

lstUNPrfLines <- str_split(video," ")
# palabras por linea
vciUNPrfWperL <- unlist(lapply(lstUNPrfLines, length))
# imprimir media de palabras por linea 
mean(vciUNPrfWperL)
## [1] 3.527629

Conteo de palabras

# deslistar para obtener un vector de palabras
vcsUNPrfWords <- unlist(lstUNPrfLines)
# recuento total de palabras = longitud del vector
intWordCount <- length(vcsUNPrfWords)
# imprimir 
intWordCount
## [1] 9193

Mostrar palabras

head(vcsUNPrfWords,100)
##   [1] "[Música]"    ""             "[Música]"    ""             "[Música]"   
##   [6] ""             "[Música]"    ""             "a"            ""            
##  [11] "[Música]"    ""             "[Música]"    ""             "y"           
##  [16] ""             "y"            ""             "[Música]"    ""            
##  [21] "ah"           ""             "[Música]"    ""             "[Música]"   
##  [26] ""             "[Música]"    ""             "[Música]"    ""            
##  [31] "[Música]"    ""             "[Música]"    ""             "ah"          
##  [36] ""             "hola"         "que"          "tal"          ""            
##  [41] "bienvenidos"  "al"           "tercer"       "y"            "último"     
##  [46] "día"         "de"           ""             "monterrey"    "este"        
##  [51] "es"           "un"           "día"         "muy"          "especial"    
##  [56] ""             "porque"       "por"          "primera"      "ocasión"    
##  [61] "se"           "introduce"    ""             "un"           "tema"        
##  [66] "que"          "es"           "muy"          "importante"   "para"        
##  [71] "todos"        ""             "no"           "sólo"        "para"        
##  [76] "la"           "comunidad"    "emprendedora" "y"            ""            
##  [81] "relacionada"  "con"          "temas"        "de"           "innovación" 
##  [86] "sino"         ""             "también"     "con"          "la"          
##  [91] "ciudad"       "con"          "las"          ""             "comunidades" 
##  [96] "y"            "con"          "el"           "planeta"      "que"

Limpieza de palabras

# lower case
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# remove numbers
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:digit:]]", "")
# remove punctuation
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:punct:]]", "")
# remove white spaces
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:space:]]", "")
# remove special chars
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[~@#$%&-_=<>]", "")
# remove empty vectors
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# hack & remove $
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="$", "")
# head
head(vcsUNPrfWords,100)
##   [1] "mãºsica"        "mãºsica"        "mãºsica"        "mãºsica"       
##   [5] "a"              "mãºsica"        "mãºsica"        "y"             
##   [9] "y"              "mãºsica"        "ah"             "mãºsica"       
##  [13] "mãºsica"        "mãºsica"        "mãºsica"        "mãºsica"       
##  [17] "mãºsica"        "ah"             "hola"           "que"           
##  [21] "tal"            "bienvenidos"    "al"             "tercer"        
##  [25] "y"              "ãºltimo"        "dã­a"           "de"            
##  [29] "monterrey"      "este"           "es"             "un"            
##  [33] "dã­a"           "muy"            "especial"       "porque"        
##  [37] "por"            "primera"        "ocasiã³n"       "se"            
##  [41] "introduce"      "un"             "tema"           "que"           
##  [45] "es"             "muy"            "importante"     "para"          
##  [49] "todos"          "no"             "sã³lo"          "para"          
##  [53] "la"             "comunidad"      "emprendedora"   "y"             
##  [57] "relacionada"    "con"            "temas"          "de"            
##  [61] "innovaciã³n"    "sino"           "tambiã©n"       "con"           
##  [65] "la"             "ciudad"         "con"            "las"           
##  [69] "comunidades"    "y"              "con"            "el"            
##  [73] "planeta"        "que"            "el"             "dã­a"          
##  [77] "de"             "hoy"            "vamos"          "a"             
##  [81] "ver"            "el"             "tema"           "de"            
##  [85] "live"           "human"          "being"          "que"           
##  [89] "tiene"          "que"            "ver"            "con"           
##  [93] "los"            "temas"          "de"             "sostenibilidad"
##  [97] "que"            "son"            "muy"            "importantes"

Data frame de palabras normales

# make data frame
dfrUNPrfWords <- data.frame(vcsUNPrfWords)
colnames(dfrUNPrfWords) <- c("Words")
dfrUNPrfWords$Words <- as.character(dfrUNPrfWords$Words)
# normal word count
head(dfrUNPrfWords,10)
##      Words
## 1  mãºsica
## 2  mãºsica
## 3  mãºsica
## 4  mãºsica
## 5        a
## 6  mãºsica
## 7  mãºsica
## 8        y
## 9        y
## 10 mãºsica

Conteo de palabras “normales”

# resumiendo los datos 
dfrUNPrfFreq <- dfrUNPrfWords %>% 
                group_by(Words) %>% 
                summarise(Freq=n()) %>% 
                arrange(desc(Freq))
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
##   Words  Freq
##   <chr> <int>
## 1 de      477
## 2 que     379
## 3 la      275
## 4 y       245
## 5 en      211
## 6 el      170

Nube de palabras normales

wordcloud(dfrUNPrfFreq$Words[1:100], dfrUNPrfFreq$Freq[1:100], random.order=F, max.words=100, colors=brewer.pal(8, "Dark2"))

Data frame de palabras realmente significantes

En esta sección se quitan las “stop words”

# significant words only
# remove all words with len <= 2
dfrUNPrfWords <- filter(dfrUNPrfWords, str_length(Words)>2)

# remover las "stop words" o palabras comunes como conjunciones 

vcsCmnWords <- c("de","que","en","y","la","a","el","es","una","un","pues","no","para","los","se","las","como","con","más","por","lo","hay","del","o","entonces","este","está","nos","pero","también","creo","porque","también","yo","ya","esta","si","me","al","son","tiene","donde","bueno","ha","sobre","ejemplo","bien","gracias","ser","eso","todo","uso","ver","tener","esto","estos","muchas","cómo","cuando","sea","tenemos","su","tienen","así","desde","han","parte","ahí","les","tal","qué","estar")
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% vcsCmnWords))

# remover las palabras no significativas para este contexto 
vcsBadWords <- c("decir","muy","están")
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% vcsBadWords))
# show
head(dfrUNPrfWords)
##     Words
## 1 mãºsica
## 2 mãºsica
## 3 mãºsica
## 4 mãºsica
## 5 mãºsica
## 6 mãºsica

Conteo de palabras significativas

dfrUNPrfFreq <- dfrUNPrfWords %>% 
                group_by(Words) %>% 
                summarise(Freq=n()) %>% 
                arrange(desc(Freq))
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
##   Words         Freq
##   <chr>        <int>
## 1 tambiã©n        67
## 2 inteligencia    49
## 3 artificial      45
## 4 caso            35
## 5 estamos         33
## 6 todos           29

“Cola” de palabras significativas

tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
##   Words           Freq
##   <chr>          <int>
## 1 vincula            1
## 2 visualizaciã³n     1
## 3 viviendo           1
## 4 vuelve             1
## 5 word               1
## 6 zona               1

Eliminar palabras dispersas

# palabras con una frecuencia absoluta menor a 5 
dfrUNPrfFreq <- filter(dfrUNPrfFreq, Freq>5)
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
##   Words     Freq
##   <chr>    <int>
## 1 primero      6
## 2 rol          6
## 3 sã³lo        6
## 4 sistema      6
## 5 teniendo     6
## 6 vida         6

Conteo final de palabras

# total word count = length of vector
intWordCountFinal <- length(dfrUNPrfFreq$Words)
# print
intWordCountFinal
## [1] 152

Categorización por frecuencias

# add FrequencyCategory colum
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat=FreqCategory(dfrUNPrfFreq$Freq))
# new data frame for Frequency Of Categorized Frequencies ... 
dfrUNPrfFocf <- dfrUNPrfFreq %>% group_by(Fcat) %>% summarise(Rfrq=n())
# 
dfrUNPrfFocf$Fcat <- factor(dfrUNPrfFocf$Fcat, levels=dfrUNPrfFocf$Fcat, ordered=T)
# head
head(dfrUNPrfFocf,10)
## # A tibble: 4 x 2
##   Fcat       Rfrq
##   <ord>     <int>
## 1 "     10"    95
## 2 "     20"    44
## 3 "     50"    12
## 4 "    100"     1

Nueva nube de palabras

wordcloud(dfrUNPrfFreq$Words[1:50], dfrUNPrfFreq$Freq[1:50], random.order=F, max.words=100, colors=brewer.pal(8, "Dark2"))

Gráfica de barras de palabras

ggplot(slice(dfrUNPrfFreq,1:30), aes(x=reorder(Words,-Freq),y=Freq)) +
    geom_bar(stat="identity", fill=rainbow(30)) +
    ylab("Frequency") +
    xlab("Words") +
    ggtitle("Primeras 30 palabras con mayor frecuencia") +
    theme(plot.title=element_text(size=rel(1.5), colour="blue")) +
    coord_flip()

Gráfica de frecuencia

ggplot(dfrUNPrfFocf, aes(Fcat,Rfrq))+
    geom_bar(stat="identity", width=0.8, fill=rainbow(length(dfrUNPrfFocf$Fcat))) +
    xlab("Words With Frequency Less Than") + ylab("Frequency") +
    theme(axis.text.x=element_text(angle=60, hjust=1, vjust=1),axis.text.y=element_text(angle=60, hjust=1, vjust=1),plot.title=element_text(size=rel(1.5), colour="blue")) +
    ggtitle("Frequency Of Word Count")

Longitud de palabras

dfrUNPrfChrs <- data.frame(Chars=nchar(dfrUNPrfFreq$Words))
#intRowCount <- nrow(table(dfrUNPrfChrs))
ggplot(dfrUNPrfChrs, aes(x=Chars)) +
    geom_histogram(binwidth=1, fill='blue') +
    geom_vline(xintercept=mean(nchar(dfrUNPrfFreq$Words)), color='black', size=1.5, alpha=.5) +
    xlab("Word Length (Chars)") + ylab("Number Of Words (Frequency)")