A1U1

José Angulo

1/31/2022

El objetivo es analizar, a través de un conteo de palabras y un análisis de frecuencia, el panel “Políticas Públicas y la Inteligencia Artificial”.

Paquetes

library(pacman)
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr")

Video de YouTube de la charla

# Embed the YouTube video in the rendered document.
strVideoUrl <- "https://www.youtube.com/watch?v=gGd5_DKqcCU"
embed_url(strVideoUrl)

Funciones

#' Map word frequencies to fixed-width bucket labels.
#'
#' Each value is assigned the label of the smallest upper bound it does not
#' exceed (5, 10, 20, 50, 100, 500, 1,000); anything above 1,000 gets
#' ">1,000". The bounded labels are left-padded with spaces to equal width.
#'
#' @param value Numeric vector of frequencies.
#' @return Character vector, one label per element of `value`; `NA` input
#'   gives `NA_character_` and empty input gives `character(0)`.
FreqCategory <- function(value) {
  vcnUpperBounds <- c(5, 10, 20, 50, 100, 500, 1000)
  vcsLabels <- c(
    "      5",
    "     10",
    "     20",
    "     50",
    "    100",
    "    500",
    "  1,000",
    ">1,000"
  )
  # left.open = TRUE makes every interval (lower, upper], so a value equal to
  # a bound stays in that bound's bucket; index 0 means "<= 5".
  intBucket <- findInterval(value, vcnUpperBounds, left.open = TRUE)
  vcsLabels[intBucket + 1L]
}

Recopilación de datos

Datos

# Read panel.txt into a character vector, one element per line of the file.
panel <- readLines(con = "panel.txt", encoding = "UTF-8")
head(panel, n = 6)
## [1] "ah"                                   
## [2] ""                                     
## [3] "hola que tal"                         
## [4] ""                                     
## [5] "bienvenidos al tercer y último día de"
## [6] ""

Conteo de líneas

# Number of lines read from the file.
intLineCount <- length(panel)
print(intLineCount)
## [1] 2569

Conteo de palabras por línea

# Split every line on single spaces. A blank line yields one empty token, so
# it still contributes 1 to the per-line counts averaged below.
lstUNPrfLines <- str_split(panel, pattern = " ")
vciUNPrfWperL <- lengths(lstUNPrfLines)
mean(vciUNPrfWperL)
## [1] 3.564033

Conteo de palabras

# Flatten the per-line token lists into a single vector and count its
# elements; empty tokens coming from blank lines are still included here.
vcsUNPrfWords <- unlist(lstUNPrfLines, use.names = FALSE)
intWordCount <- length(vcsUNPrfWords)
print(intWordCount)
## [1] 9156

Mostrar palabras

# Preview the first 100 raw tokens.
print(head(vcsUNPrfWords, n = 100L))
##   [1] "ah"             ""               "hola"           "que"           
##   [5] "tal"            ""               "bienvenidos"    "al"            
##   [9] "tercer"         "y"              "último"         "día"           
##  [13] "de"             ""               "monterrey"      "este"          
##  [17] "es"             "un"             "día"            "muy"           
##  [21] "especial"       ""               "porque"         "por"           
##  [25] "primera"        "ocasión"        "se"             "introduce"     
##  [29] ""               "un"             "tema"           "que"           
##  [33] "es"             "muy"            "importante"     "para"          
##  [37] "todos"          ""               "no"             "sólo"          
##  [41] "para"           "la"             "comunidad"      "emprendedora"  
##  [45] "y"              ""               "relacionada"    "con"           
##  [49] "temas"          "de"             "innovación"     "sino"          
##  [53] ""               "también"        "con"            "la"            
##  [57] "ciudad"         "con"            "las"            ""              
##  [61] "comunidades"    "y"              "con"            "el"            
##  [65] "planeta"        "que"            "el"             "día"           
##  [69] ""               "de"             "hoy"            "vamos"         
##  [73] "a"              "ver"            "el"             "tema"          
##  [77] "de"             "live"           "human"          ""              
##  [81] "being"          "que"            "tiene"          "que"           
##  [85] "ver"            "con"            "los"            "temas"         
##  [89] "de"             ""               "sostenibilidad" "que"           
##  [93] "son"            "muy"            "importantes"    ""              
##  [97] "en"             "las"            "ciudades"       "este"

Limpieza de palabras

# Normalise the raw tokens: lower-case them, strip digits, punctuation,
# whitespace and a fixed set of symbols, then drop tokens left empty.
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:digit:]]", "")
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:punct:]]", "")
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:space:]]", "")
# The hyphen is escaped so it is a literal "-"; unescaped, "&-_" is read as a
# character range running from "&" to "_". The literal "$" is removed by this
# class as well, so no separate pass for it is needed (a bare "$" pattern is
# only the end-of-string anchor and replaces nothing).
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[~@#$%&\\-_=<>]", "")
# Remove empty tokens last, after every replacement has run.
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
head(vcsUNPrfWords, 100)
##   [1] "ah"             "hola"           "que"            "tal"           
##   [5] "bienvenidos"    "al"             "tercer"         "y"             
##   [9] "último"         "día"            "de"             "monterrey"     
##  [13] "este"           "es"             "un"             "día"           
##  [17] "muy"            "especial"       "porque"         "por"           
##  [21] "primera"        "ocasión"        "se"             "introduce"     
##  [25] "un"             "tema"           "que"            "es"            
##  [29] "muy"            "importante"     "para"           "todos"         
##  [33] "no"             "sólo"           "para"           "la"            
##  [37] "comunidad"      "emprendedora"   "y"              "relacionada"   
##  [41] "con"            "temas"          "de"             "innovación"    
##  [45] "sino"           "también"        "con"            "la"            
##  [49] "ciudad"         "con"            "las"            "comunidades"   
##  [53] "y"              "con"            "el"             "planeta"       
##  [57] "que"            "el"             "día"            "de"            
##  [61] "hoy"            "vamos"          "a"              "ver"           
##  [65] "el"             "tema"           "de"             "live"          
##  [69] "human"          "being"          "que"            "tiene"         
##  [73] "que"            "ver"            "con"            "los"           
##  [77] "temas"          "de"             "sostenibilidad" "que"           
##  [81] "son"            "muy"            "importantes"    "en"            
##  [85] "las"            "ciudades"       "este"           "en"            
##  [89] "el"             "desarrollo"     "de"             "la"            
##  [93] "movilidad"      "en"             "los"            "temas"         
##  [97] "del"            "uso"            "de"             "tecnología"

Data frame de palabras generales

# One-column data frame ("Words") holding each cleaned token as a character
# string.
dfrUNPrfWords <- data.frame(
  Words = as.character(vcsUNPrfWords),
  stringsAsFactors = FALSE
)
head(dfrUNPrfWords, n = 10)
##          Words
## 1           ah
## 2         hola
## 3          que
## 4          tal
## 5  bienvenidos
## 6           al
## 7       tercer
## 8            y
## 9       último
## 10         día

Uso de palabras generales

Conteo de palabras generales

# Count the occurrences of every distinct word and list the most frequent
# words first.
dfrUNPrfFreq <- dfrUNPrfWords %>%
  group_by(Words) %>%
  tally(name = "Freq", sort = TRUE)
head(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
##   Words  Freq
##   <chr> <int>
## 1 de      477
## 2 que     379
## 3 la      275
## 4 y       243
## 5 en      211
## 6 el      170

Nube de palabras generales

# Word cloud of the most frequent words (at most 100). head() replaces the
# 1:100 index so a table with fewer rows does not produce NA words/counts.
wordcloud(
  head(dfrUNPrfFreq$Words, 100),
  head(dfrUNPrfFreq$Freq, 100),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)

Uso de palabras significativas

Data frame de palabras realmente significativas

# Words excluded from the analysis: common function words and fillers.
vcsCmnWords <- c(
  "de", "que", "en", "y", "la", "a", "el", "es", "una", "un", "pues", "no",
  "para", "los", "se", "las", "como", "con", "más", "por", "lo", "hay",
  "del", "o", "entonces", "este", "está", "nos", "pero", "también", "creo",
  "porque", "también", "yo", "ya", "esta", "si", "me", "al", "son", "tiene",
  "donde", "bueno", "ha", "sobre", "ejemplo", "bien", "gracias", "ser",
  "eso", "todo", "uso", "ver", "tener", "esto", "estos", "muchas", "cómo",
  "cuando", "sea", "tenemos", "su", "tienen", "así", "desde", "han",
  "parte", "ahí", "les", "tal", "qué", "estar"
)

# Second exclusion list, applied in addition to the one above.
vcsBadWords <- c(
  "decir", "muy", "están", "buenas", "tardes", "todos", "todas", "vamos",
  "puede", "podemos", "estamos", "vamos", "queremos", "algo", "ese",
  "nosotros", "quizás", "sus", "veces", "vez", "esa", "ella"
)

# Keep words longer than two characters that appear in neither list.
dfrUNPrfWords <- dfrUNPrfWords %>%
  filter(
    str_length(Words) > 2,
    !(Words %in% vcsCmnWords),
    !(Words %in% vcsBadWords)
  )

head(dfrUNPrfWords, n = 6)
##         Words
## 1        hola
## 2 bienvenidos
## 3      tercer
## 4      último
## 5         día
## 6   monterrey

Conteo de palabras significativas

# Recount occurrences per word on the filtered data, most frequent first.
dfrUNPrfFreq <- dfrUNPrfWords %>%
  group_by(Words) %>%
  tally(name = "Freq", sort = TRUE)
head(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
##   Words         Freq
##   <chr>        <int>
## 1 inteligencia    49
## 2 artificial      45
## 3 caso            35
## 4 gobierno        28
## 5 justamente      27
## 6 precisamente    25

Cola del vector de las palabras significativas

# Show the last six rows of the frequency table (the least frequent words).
tail(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
##   Words          Freq
##   <chr>         <int>
## 1 vincula           1
## 2 visualización     1
## 3 viviendo          1
## 4 vuelve            1
## 5 word              1
## 6 zona              1

Eliminar palabras dispersas

# Keep only the words that occur more than five times.
dfrUNPrfFreq <- dfrUNPrfFreq %>%
  filter(Freq > 5)
tail(dfrUNPrfFreq, n = 6)
## # A tibble: 6 x 2
##   Words     Freq
##   <chr>    <int>
## 1 primero      6
## 2 rol          6
## 3 sistema      6
## 4 sólo         6
## 5 teniendo     6
## 6 vida         6

Conteo final de palabras

# Number of distinct words left after filtering (one row per word).
intWordCountFinal <- nrow(dfrUNPrfFreq)
print(intWordCountFinal)
## [1] 132

Categorización por frecuencias

# Label every word with its frequency bucket, then count words per bucket.
# Freq is referenced as a column inside mutate() rather than through
# dfrUNPrfFreq$Freq, so the value always comes from the data being mutated.
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat = FreqCategory(Freq))
dfrUNPrfFocf <- dfrUNPrfFreq %>%
  group_by(Fcat) %>%
  summarise(Rfrq = n())
# Ordered factor whose levels follow the row order returned by summarise().
dfrUNPrfFocf$Fcat <- factor(
  dfrUNPrfFocf$Fcat,
  levels = dfrUNPrfFocf$Fcat,
  ordered = TRUE
)
head(dfrUNPrfFocf, 10)
## # A tibble: 3 x 2
##   Fcat       Rfrq
##   <ord>     <int>
## 1 "     10"    89
## 2 "     20"    36
## 3 "     50"     7

Nube de palabras significativas

# Word cloud of the most frequent remaining words (at most 50). head()
# replaces the 1:50 index so a shorter table does not produce NA entries.
wordcloud(
  head(dfrUNPrfFreq$Words, 50),
  head(dfrUNPrfFreq$Freq, 50),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)

Gráfica de barras de palabras

# Horizontal bar chart of the most frequent words (at most 30).
# The palette length is taken from the rows actually plotted, so the chart
# still renders when fewer than 30 words are available.
dfrTopWords <- head(dfrUNPrfFreq, 30)
ggplot(dfrTopWords, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = rainbow(nrow(dfrTopWords))) +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Primeras 30 palabras con mayor frecuencia") +
  theme(plot.title = element_text(size = rel(1.5), colour = "blue")) +
  coord_flip()

Gráfica de frecuencia

# Bar chart of how many words fall into each frequency bucket, one colour
# per bucket.
vcsBucketColours <- rainbow(nrow(dfrUNPrfFocf))
ggplot(dfrUNPrfFocf, aes(x = Fcat, y = Rfrq)) +
  geom_bar(stat = "identity", width = 0.8, fill = vcsBucketColours) +
  xlab("Words With Frequency Less Than") +
  ylab("Frequency") +
  ggtitle("Frequency Of Word Count") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1),
    axis.text.y = element_text(angle = 60, hjust = 1, vjust = 1),
    plot.title = element_text(size = rel(1.5), colour = "blue")
  )

Longitud de palabras

# Histogram of word lengths (in characters) for the retained words, with a
# vertical line marking the mean length.
dfrUNPrfChrs <- data.frame(Chars = nchar(dfrUNPrfFreq$Words))
dblMeanChars <- mean(dfrUNPrfChrs$Chars)
ggplot(dfrUNPrfChrs, aes(x = Chars)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  geom_vline(xintercept = dblMeanChars, color = "black", size = 1.5, alpha = .5) +
  xlab("Word Length (Chars)") +
  ylab("Number Of Words (Frequency)")