R Packages
# Pacotes e Funções
library(tidyverse) # Manipulacao eficiente de dados
library(tidytext) # Manipulacao eficiente de texto
library(textreadr) # Leitura de pdf para texto
library(tm) # Pacote de mineracao de texto com stopwords
library(wordcloud) # Grafico nuvem de palavras
library(igraph)
library(ggraph)
library(ggplot2)
library(dplyr)
library(pdftools)
library(RRPP)

Text Normalization Function
# Normalise text for text mining.
#
# Replaces accented letters with their unaccented equivalents, turns stray
# diacritic marks, the listed punctuation/symbols and digits into blanks,
# collapses whitespace runs, trims both ends and lower-cases the result.
#
# texto: character vector to normalise.
# Returns a character vector with one normalised string per input element.
NormalizaParaTextMining <- function(texto) {
  acentuadas <- "áéíóúÁÉÍÓÚýÝàèìòùÀÈÌÒÙâêîôûÂÊÎÔÛãõÃÕñÑäëïöüÄËÏÖÜÿçÇ"
  sem_acento <- "aeiouAEIOUyYaeiouAEIOUaeiouAEIOUaoAOnNaeiouAEIOUycC"
  descartar <- "´`^~¨:.!?&$@#0123456789"

  # chartr() fails when `old` is longer than `new`, so the blank padding for
  # the discarded characters is generated from their count instead of typed.
  limpo <- chartr(
    old = paste0(acentuadas, descartar),
    new = paste0(sem_acento, strrep(" ", nchar(descartar))),
    x = texto
  )

  # Collapse whitespace runs, then trim the single blank left at either end
  limpo <- gsub("[[:space:]]+", " ", limpo)
  limpo <- gsub("^ | $", "", limpo)

  tolower(limpo)
}
# Text Cleaning Function
# Words that can be removed from the analysis: the Portuguese stop words
# returned by stopwords(kind = "pt") plus every single letter, normalised with
# NormalizaParaTextMining() so they compare equal to the normalised tokens.
# The column is named `Palavra` to match the token column used in the joins.
palavrasRemover <- tibble(
  Palavra = NormalizaParaTextMining(c(stopwords(kind = "pt"), letters))
)
Read pdf
Tidying text
Cleaning useless words for single word analysis
# Single-word tokens for the frequency analysis: split the text into words,
# normalise each word and drop the ones listed in palavrasRemover.
CleanW <- TidyT %>%
  unnest_tokens(Palavra, text) %>%
  mutate(Palavra = NormalizaParaTextMining(Palavra)) %>%
  # Explicit key: no "Joining, by = ..." message, and the join stays on
  # `Palavra` even if both tables ever share another column name.
  anti_join(palavrasRemover, by = "Palavra")
Counting single words
# Word frequency table, most frequent first; empty tokens are left out.
frequenciaPalavras <- CleanW %>%
  filter(Palavra != "") %>%
  count(Palavra, sort = TRUE)
# Interactive table with the frequency of every word
DT::datatable(frequenciaPalavras)

# Horizontal bar chart of the 15 most frequent words
head(frequenciaPalavras, n = 15) %>%
  ggplot(aes(Palavra, n)) +
  geom_col(color = "black", fill = "#87CEFA") +
  geom_text(aes(label = n), hjust = 1.3) +
  coord_flip() +
  # The number in the title must match the n = 15 passed to head() above
  labs(
    title = "15 Palavras mais mencionadas",
    x = "Palavras",
    y = "Número de usos"
  )

# Base-graphics version of the same top 15. head() is used instead of
# [1:15, ] so a table with fewer than 15 words does not produce NA bars.
with(
  head(frequenciaPalavras, n = 15),
  barplot(
    n,
    names.arg = Palavra,
    ylab = "Quantidade",
    # NOTE(review): hard-coded axis range - confirm it suits the counts
    ylim = c(40, 100),
    las = 2,
    col = rainbow(8),
    main = "Top 15"
  )
)

# Analysing pairs of words
Cleaning useless words and tidying data for pairwise word analysis
# Extra tokens to exclude from the n-gram analysis: the numbers 1 to 3, URL
# fragments ("www", "http"), a comma and a hand-picked list of words.
my_stopwords <- tibble(
  Palavra = c(
    as.character(1:3),
    "www", "http", ",", "de", "os", "nao", "da", "sao", "um", "dos", "em",
    "para", "possui", "sobre", "das", "na", "uma", "se", "não", "outro",
    "entao", "existem", "outros", "aos", "quando", "seus", "seu", "nas", "na",
    "evitar", "possuem", "ou", "sua", "então", "podem", "esta", "bem", "ha",
    "há", "forem", "documentadas", "documentados", "alta"
  )
)
# Bigram table for the pairwise analysis. Each word pair is normalised, split
# into its two words and discarded when either word is a stop word; the
# remaining pairs are counted, most frequent first.
par_Palavras <- TidyT %>%
  unnest_tokens(Palavra, text, token = "ngrams", n = 2) %>%
  # Drop missing bigrams with filter(): na.omit() inside mutate() would
  # shorten the column and make mutate() fail on the length mismatch.
  filter(!is.na(Palavra)) %>%
  mutate(Palavra = NormalizaParaTextMining(Palavra)) %>%
  # Normalisation can leave fewer or more than two words in a pair: extra
  # pieces are dropped and missing ones become NA (removed right below).
  separate(
    Palavra, c("word1", "word2"),
    sep = " ", extra = "drop", fill = "right"
  ) %>%
  filter(!is.na(word2), word1 != "", word2 != "") %>%
  # Stop words are matched word by word: a join on the whole two-word string
  # cannot match the single-word entries of the stop-word tables.
  filter(
    !word1 %in% palavrasRemover$Palavra,
    !word2 %in% palavrasRemover$Palavra,
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word1 %in% my_stopwords$Palavra,
    !word2 %in% my_stopwords$Palavra
  ) %>%
  count(word1, word2, sort = TRUE)
# Plot the network of word pairs that occur at least `z` times.
#
# z:     minimum value of the count column `n` for a pair to be drawn.
# pares: table whose first two columns are the words of each pair and that
#        has a count column `n`; defaults to the global par_Palavras.
# Returns the ggraph plot object.
.Par_net <- function(z, pares = par_Palavras) {
  pares %>%
    filter(n >= z) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
    geom_node_point(color = "darkslategray4", size = 4) +
    geom_node_text(aes(label = name), color = "red", vjust = 1.8, size = 3) +
    # x and y are arguments of labs(); inside paste() they would only be
    # glued onto the subtitle and the axis titles would not be blanked.
    labs(
      title = "Word graph - HF2 Framework",
      subtitle = paste("Pairwise analysis words >=", z, "frequency"),
      x = "",
      y = ""
    )
}
# NOTE(review): par(mfrow) configures base graphics only; the ggraph plot
# drawn next presumably ignores it - confirm whether it is still wanted.
par(mfrow = c(2, 2))
.Par_net(10)

# Trigram table: three-word sequences with no stop word in any position,
# counted and sorted by frequency.
trio_Palavras <- TidyT %>%
  unnest_tokens(Palavra, text, token = "ngrams", n = 3) %>%
  # Missing trigrams would otherwise survive the filters below and be
  # counted as an (NA, NA, NA) row.
  filter(!is.na(Palavra)) %>%
  separate(Palavra, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word,
    !word1 %in% my_stopwords$Palavra,
    !word2 %in% my_stopwords$Palavra,
    !word3 %in% my_stopwords$Palavra
  ) %>%
  count(word1, word2, word3, sort = TRUE)
DT::datatable(trio_Palavras)

We may instead want to examine correlation among words, which indicates how often they appear together relative to how often they appear separately.
In particular, here we’ll focus on the phi coefficient, a common measure for binary correlation. The focus of the phi coefficient is how much more likely it is that either both word X and Y appear, or neither do, than that one appears without the other.
The phi coefficient is equivalent to the Pearson correlation, which you may have heard of elsewhere, when it is applied to binary data.
The pairwise_cor() function in widyr lets us find the phi coefficient between words based on how often they appear in the same section. Its syntax is similar to pairwise_count().
library(widyr)

# Correlation between first words (item = word1) based on the second words
# they occur with (feature = word2). Relatively rare words are filtered out
# first: only first words present in at least five rows of par_Palavras stay.
word_cor <- par_Palavras %>%
  group_by(word1) %>%
  filter(n() >= 5) %>%
  widyr::pairwise_cor(word1, word2, sort = TRUE)

# Same correlation, but keeping the rows whose second word is present in at
# least five rows of par_Palavras.
word_cor2 <- par_Palavras %>%
  group_by(word2) %>%
  filter(n() >= 5) %>%
  widyr::pairwise_cor(word1, word2, sort = TRUE)

# Words of interest for the correlation charts. They are written without
# accents because the words in par_Palavras have been normalised by
# NormalizaParaTextMining(); an accented entry could never match.
Filter_word <- c(
  "trabalho", "condicoes", "organizacao", "gestao", "acoes", "riscos",
  "repertorio", "competencia", "conhecimento", "procedimentos", "design",
  "habilidades", "ambiente", "seguranca", "tecnicas"
)
# Faceted bar chart with the strongest correlations of selected words.
#
# x: correlation table with columns item1, item2 and correlation.
# y: character vector with the item1 words to show, one facet per word.
# z: number of highest correlations kept for each word.
# Returns the ggplot object.
.Dendogram <- function(x, y, z) {
  x %>%
    filter(item1 %in% y) %>%
    group_by(item1) %>%
    # Name the ranking column instead of letting top_n() pick the last one
    top_n(z, correlation) %>%
    ungroup() %>%
    # reorder_within() orders the bars inside each facet; scale_x_reordered()
    # below removes the suffix it appends to the labels.
    mutate(item2 = reorder_within(item2, correlation, item1)) %>%
    ggplot(aes(item2, correlation, fill = item1)) +
    # A single bar layer; the facets already identify each word, so no legend
    geom_col(show.legend = FALSE) +
    facet_wrap(~ item1, ncol = 2, scales = "free") +
    coord_flip() +
    scale_x_reordered() +
    labs(x = NULL, y = NULL)
}

.Dendogram(word_cor, Filter_word, 10)
# Community dendrogram of the 100 strongest positive correlations in word_cor
palavras <- word_cor
# Only positive correlations are used as edge weights for the clustering
wordnetwork <- head(filter(palavras, correlation > 0), 100)
wordnetwork <- graph_from_data_frame(wordnetwork)
# Keep `correlation` when duplicated edges are collapsed; with the default
# combination rule the attribute would be dropped and no weight would remain.
wordnetwork2 <- as.undirected(
  wordnetwork,
  mode = "collapse",
  edge.attr.comb = list(correlation = "mean", "ignore")
)
# The tables hold item1, item2 and correlation (there is no `n` attribute)
comm <- cluster_fast_greedy(
  wordnetwork2,
  weights = E(wordnetwork2)$correlation
)
# `hang` is not passed: the plot only warned that it is not a graphical
# parameter and ignored it.
plot_dendrogram(comm, main = "Pairwise word clusters dendogram", cex = .9)

# Same analysis for the 40 strongest positive correlations in word_cor2
palavras <- word_cor2
wordnetwork <- head(filter(palavras, correlation > 0), 40)
wordnetwork <- graph_from_data_frame(wordnetwork)
wordnetwork2 <- as.undirected(
  wordnetwork,
  mode = "collapse",
  edge.attr.comb = list(correlation = "mean", "ignore")
)
comm <- cluster_fast_greedy(
  wordnetwork2,
  weights = E(wordnetwork2)$correlation
)
plot_dendrogram(comm, main = "Pairwise word clusters dendogram", cex = .9)
# Correlations restricted to first words present in at least 15 rows of
# par_Palavras.
word_cor1 <- par_Palavras %>%
  group_by(word1) %>%
  filter(n() >= 15) %>%
  pairwise_cor(word1, word2, sort = TRUE)

# Network of the correlated words; edge opacity follows the correlation.
ggraph(graph_from_data_frame(word_cor1), layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

# National Commission on the BP Deepwater Horizon Oil Spill and Offshore
# Drilling (NCDWHSOD). Deep Water: The Gulf Oil Disaster and the Future of
# Offshore Drilling. Report to the President. January 2011 Cover Photo:
# © Steadfast TV. ISBN: 978-0-16-087371-3.
# https://www.govinfo.gov/content/pkg/GPO-OILCOMMISSION/pdf/GPO-OILCOMMISSION.pdf
http://data7.blog/grafo-de-palavras-anitta-twitter/
Análise de palavras. Disponível em: https://www.ufrgs.br/wiki-r/index.php?title=Frequ%C3%AAncia_das_palavras_e_nuvem_de_palavras Esta página foi modificada pela última vez em 12 de dezembro de 2018, às 19h30min. Conteúdo disponível sob Creative Commons - Atribuição - Compartilha nos Mesmos Termos, salvo indicação em contrário.
https://p4husp.github.io/material/tutorial11/
Principal: https://www.tidytextmining.com/ngrams.html
Corpus and Machine Learning: https://rstudio-pubs-static.s3.amazonaws.com/265713_cbef910aee7642dc8b62996e38d2825d.html
Machine learning: https://kenbenoit.net/pdfs/text_analysis_in_R.pdf
Mineração de texto: https://www.rpubs.com/LaionBoaventura/mineracaodetexto