El objetivo es analizar, a través de un conteo de palabras y un análisis de frecuencia, el panel de “Políticas Públicas y la Inteligencia Artificial”.
Paquetes
library(pacman)
p_load("dplyr", "stringr", "ggplot2", "wordcloud","rmdformats","vembedr")
Video de YouTube de la charla
# Embed the recording of the panel in the rendered report.
embed_url(url = "https://www.youtube.com/watch?v=gGd5_DKqcCU")
Funciones
# Map word frequencies to the label of the frequency bucket they fall in.
#
# Each bucket is identified by its inclusive upper bound: values <= 5 get
# " 5", values in (5, 10] get " 10", and so on; anything above 1000 gets
# ">1,000".
#
# value: numeric vector of frequencies.
# Returns a character vector of the same length as `value` (names are kept).
# Missing values map to NA, and empty input yields character(0).
FreqCategory <- function(value) {
  upperBounds <- c(5, 10, 20, 50, 100, 500, 1000)
  bucketLabels <- c(
    " 5", " 10", " 20", " 50", " 100", " 500", " 1,000", ">1,000"
  )
  # left.open = TRUE makes the intervals (lower, upper], so a value equal to
  # a bound stays in that bound's bucket, matching the `<=` comparisons.
  bucket <- findInterval(value, upperBounds, left.open = TRUE) + 1L
  strCategory <- bucketLabels[bucket]
  names(strCategory) <- names(value)
  strCategory
}
Recopilación de datos
Datos
# Read the transcript, one element per line of the file, marking the
# strings as UTF-8, and preview the first lines.
panel <- readLines(con = "panel.txt", encoding = "UTF-8")
head(panel, n = 6L)
## [1] "ah"
## [2] ""
## [3] "hola que tal"
## [4] ""
## [5] "bienvenidos al tercer y último día de"
## [6] ""
Conteo de líneas
# Number of lines read from the transcript (blank lines included).
intLineCount <- length(panel)
intLineCount
## [1] 2569
Conteo de palabras por línea
# Split every line on single spaces; each list element holds one line's
# tokens. A blank line yields one empty token, so it counts as one "word".
lstUNPrfLines <- str_split(panel, " ")
# lengths() gives the token count per line as an integer vector, and stays
# an integer vector (instead of NULL) when there are no lines at all.
vciUNPrfWperL <- lengths(lstUNPrfLines)
mean(vciUNPrfWperL)
## [1] 3.564033
Conteo de palabras
# Flatten the per-line token lists into one vector and count the tokens
# (empty tokens produced by blank lines are still included here).
vcsUNPrfWords <- unlist(lstUNPrfLines, use.names = FALSE)
intWordCount <- length(vcsUNPrfWords)
intWordCount
## [1] 9156
Mostrar palabras
# Preview the first 100 raw tokens.
head(vcsUNPrfWords, n = 100)
## [1] "ah" "" "hola" "que"
## [5] "tal" "" "bienvenidos" "al"
## [9] "tercer" "y" "último" "día"
## [13] "de" "" "monterrey" "este"
## [17] "es" "un" "día" "muy"
## [21] "especial" "" "porque" "por"
## [25] "primera" "ocasión" "se" "introduce"
## [29] "" "un" "tema" "que"
## [33] "es" "muy" "importante" "para"
## [37] "todos" "" "no" "sólo"
## [41] "para" "la" "comunidad" "emprendedora"
## [45] "y" "" "relacionada" "con"
## [49] "temas" "de" "innovación" "sino"
## [53] "" "también" "con" "la"
## [57] "ciudad" "con" "las" ""
## [61] "comunidades" "y" "con" "el"
## [65] "planeta" "que" "el" "día"
## [69] "" "de" "hoy" "vamos"
## [73] "a" "ver" "el" "tema"
## [77] "de" "live" "human" ""
## [81] "being" "que" "tiene" "que"
## [85] "ver" "con" "los" "temas"
## [89] "de" "" "sostenibilidad" "que"
## [93] "son" "muy" "importantes" ""
## [97] "en" "las" "ciudades" "este"
Limpieza de palabras
# Normalise the tokens: lower-case them, strip digits, punctuation,
# whitespace and leftover symbols, then drop tokens that ended up empty.
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:digit:]]", "")
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:punct:]]", "")
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = "[[:space:]]", "")
# The hyphen is escaped so it is a literal "-"; unescaped, "&-_" is read as
# a character range running from "&" to "_".
vcsUNPrfWords <- str_replace_all(
  vcsUNPrfWords,
  pattern = "[~@#$%&\\-_=<>]",
  ""
)
# fixed() makes "$" a literal dollar sign; as a bare regex it is the
# end-of-string anchor and the replacement would change nothing.
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern = fixed("$"), "")
# Filter last, so tokens emptied by any of the steps above are removed.
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
head(vcsUNPrfWords, 100)
## [1] "ah" "hola" "que" "tal"
## [5] "bienvenidos" "al" "tercer" "y"
## [9] "último" "día" "de" "monterrey"
## [13] "este" "es" "un" "día"
## [17] "muy" "especial" "porque" "por"
## [21] "primera" "ocasión" "se" "introduce"
## [25] "un" "tema" "que" "es"
## [29] "muy" "importante" "para" "todos"
## [33] "no" "sólo" "para" "la"
## [37] "comunidad" "emprendedora" "y" "relacionada"
## [41] "con" "temas" "de" "innovación"
## [45] "sino" "también" "con" "la"
## [49] "ciudad" "con" "las" "comunidades"
## [53] "y" "con" "el" "planeta"
## [57] "que" "el" "día" "de"
## [61] "hoy" "vamos" "a" "ver"
## [65] "el" "tema" "de" "live"
## [69] "human" "being" "que" "tiene"
## [73] "que" "ver" "con" "los"
## [77] "temas" "de" "sostenibilidad" "que"
## [81] "son" "muy" "importantes" "en"
## [85] "las" "ciudades" "este" "en"
## [89] "el" "desarrollo" "de" "la"
## [93] "movilidad" "en" "los" "temas"
## [97] "del" "uso" "de" "tecnología"
Data frame de palabras generales
# One-column data frame (`Words`, character) holding every cleaned token.
dfrUNPrfWords <- data.frame(
  Words = as.character(vcsUNPrfWords),
  stringsAsFactors = FALSE
)
head(dfrUNPrfWords, n = 10)
## Words
## 1 ah
## 2 hola
## 3 que
## 4 tal
## 5 bienvenidos
## 6 al
## 7 tercer
## 8 y
## 9 último
## 10 día
Uso de palabras generales
Conteo de palabras generales
# Occurrences of each distinct word (column `Freq`), most frequent first.
dfrUNPrfFreq <- dfrUNPrfWords %>%
  group_by(Words) %>%
  tally(name = "Freq", sort = TRUE)
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 de 477
## 2 que 379
## 3 la 275
## 4 y 243
## 5 en 211
## 6 el 170
Nube de palabras generales
# Word cloud of the (up to) 100 most frequent words. head() is used instead
# of `[1:100]` so that a table with fewer than 100 rows is not padded with
# NA entries.
wordcloud(
  head(dfrUNPrfFreq$Words, 100),
  head(dfrUNPrfFreq$Freq, 100),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
Uso de palabras significativas
Data frame de palabras realmente significativas
# Common function words that carry no meaning for the analysis.
vcsCmnWords <- c(
  "de", "que", "en", "y", "la", "a", "el", "es", "una", "un", "pues", "no",
  "para", "los", "se", "las", "como", "con", "más", "por", "lo", "hay",
  "del", "o", "entonces", "este", "está", "nos", "pero", "también", "creo",
  "porque", "también", "yo", "ya", "esta", "si", "me", "al", "son", "tiene",
  "donde", "bueno", "ha", "sobre", "ejemplo", "bien", "gracias", "ser",
  "eso", "todo", "uso", "ver", "tener", "esto", "estos", "muchas", "cómo",
  "cuando", "sea", "tenemos", "su", "tienen", "así", "desde", "han", "parte",
  "ahí", "les", "tal", "qué", "estar"
)
# Additional filler words excluded from the analysis.
vcsBadWords <- c(
  "decir", "muy", "están", "buenas", "tardes", "todos", "todas", "vamos",
  "puede", "podemos", "estamos", "vamos", "queremos", "algo", "ese",
  "nosotros", "quizás", "sus", "veces", "vez", "esa", "ella"
)
# Keep words longer than two characters that appear in neither list.
dfrUNPrfWords <- dfrUNPrfWords %>%
  filter(
    str_length(Words) > 2,
    !(Words %in% vcsCmnWords),
    !(Words %in% vcsBadWords)
  )
head(dfrUNPrfWords)
## Words
## 1 hola
## 2 bienvenidos
## 3 tercer
## 4 último
## 5 día
## 6 monterrey
Conteo de palabras significativas
# Recount the occurrences (column `Freq`) on the filtered words, most
# frequent first.
dfrUNPrfFreq <- dfrUNPrfWords %>%
  group_by(Words) %>%
  tally(name = "Freq", sort = TRUE)
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 inteligencia 49
## 2 artificial 45
## 3 caso 35
## 4 gobierno 28
## 5 justamente 27
## 6 precisamente 25
Cola del vector de las palabras significativas
# Inspect the least frequent words at the end of the frequency table.
tail(dfrUNPrfFreq, n = 6L)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 vincula 1
## 2 visualización 1
## 3 viviendo 1
## 4 vuelve 1
## 5 word 1
## 6 zona 1
Eliminar palabras dispersas
# Keep only the words that occur more than five times, then check the tail.
dfrUNPrfFreq <- dfrUNPrfFreq %>%
  filter(Freq > 5)
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 primero 6
## 2 rol 6
## 3 sistema 6
## 4 sólo 6
## 5 teniendo 6
## 6 vida 6
Conteo final de palabras
# Number of distinct words left in the frequency table (one row per word).
intWordCountFinal <- nrow(dfrUNPrfFreq)
intWordCountFinal
## [1] 132
Categorización por frecuencias
# Bucket labels in increasing order of magnitude. These must mirror the
# labels returned by FreqCategory().
vcsFcatOrder <- c(
  " 5", " 10", " 20", " 50", " 100", " 500", " 1,000", ">1,000"
)
# Tag every word with its frequency bucket.
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat = FreqCategory(Freq))
# Number of words (`Rfrq`) that fall in each bucket.
dfrUNPrfFocf <- dfrUNPrfFreq %>%
  group_by(Fcat) %>%
  summarise(Rfrq = n())
# Order the levels by bucket magnitude rather than by the string sort order
# of the summary (which would place " 5" after " 20"). intersect() keeps
# only the buckets actually present, in magnitude order.
dfrUNPrfFocf$Fcat <- factor(
  dfrUNPrfFocf$Fcat,
  levels = intersect(vcsFcatOrder, dfrUNPrfFocf$Fcat),
  ordered = TRUE
)
# Present the rows in the same order as the factor levels.
dfrUNPrfFocf <- arrange(dfrUNPrfFocf, Fcat)
head(dfrUNPrfFocf, 10)
## # A tibble: 3 x 2
## Fcat Rfrq
## <ord> <int>
## 1 " 10" 89
## 2 " 20" 36
## 3 " 50" 7
Nube de palabras significativas
# Word cloud of the (up to) 50 most frequent significant words. head() is
# used instead of `[1:50]` so that a table with fewer than 50 rows is not
# padded with NA entries.
wordcloud(
  head(dfrUNPrfFreq$Words, 50),
  head(dfrUNPrfFreq$Freq, 50),
  random.order = FALSE,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
Gráfica de barras de palabras
# Horizontal bar chart of the (up to) 30 most frequent words.
dfrUNPrfTop <- head(dfrUNPrfFreq, 30)
ggplot(dfrUNPrfTop, aes(x = reorder(Words, -Freq), y = Freq)) +
  # One colour per plotted bar: the palette is sized from the rows actually
  # plotted, so the chart still renders when fewer than 30 words remain.
  geom_col(fill = rainbow(nrow(dfrUNPrfTop))) +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Primeras 30 palabras con mayor frecuencia") +
  theme(plot.title = element_text(size = rel(1.5), colour = "blue")) +
  coord_flip()
Gráfica de frecuencia
# Bar chart of how many words fall in each frequency bucket, one colour
# per bucket.
ggplot(dfrUNPrfFocf, aes(x = Fcat, y = Rfrq)) +
  geom_col(width = 0.8, fill = rainbow(nrow(dfrUNPrfFocf))) +
  labs(
    x = "Words With Frequency Less Than",
    y = "Frequency",
    title = "Frequency Of Word Count"
  ) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1),
    axis.text.y = element_text(angle = 60, hjust = 1, vjust = 1),
    plot.title = element_text(size = rel(1.5), colour = "blue")
  )
Longitud de palabras
# Histogram of word lengths (in characters) for the retained words, with a
# vertical line marking the mean length.
dfrUNPrfChrs <- data.frame(Chars = nchar(dfrUNPrfFreq$Words))
ggplot(dfrUNPrfChrs, aes(x = Chars)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  geom_vline(
    xintercept = mean(dfrUNPrfChrs$Chars),
    color = "black",
    size = 1.5,
    alpha = .5
  ) +
  labs(x = "Word Length (Chars)", y = "Number Of Words (Frequency)")