R Packages

# Packages and functions
library(tidyverse) # Efficient data manipulation
library(tidytext) # Tidy text manipulation
library(textreadr) # Read PDF into text
library(tm) # Text mining package with stopword lists
library(wordcloud) # Word cloud plots
library(igraph) # Graph objects and community detection
library(ggraph) # Network plots with the ggplot2 grammar
library(ggplot2) # Plotting (also attached by tidyverse)
library(dplyr) # Data manipulation (also attached by tidyverse)
library(pdftools) # PDF text extraction
library(RRPP) # Loaded here but not used in this analysis
setwd("~/Text Mining")

Text Normalization Function

# Function to normalize text for mining
NormalizaParaTextMining <- function(texto){

  # Normalize the text
  texto %>% 
    chartr(
      old = "áéíóúÁÉÍÓÚýÝàèìòùÀÈÌÒÙâêîôûÂÊÎÔÛãõÃÕñÑäëïöüÄËÏÖÜÿçÇ´`^~¨:.!?&$@#0123456789",
      new = "aeiouAEIOUyYaeiouAEIOUaeiouAEIOUaoAOnNaeiouAEIOUycC                       ",
      x = .) %>% # Strip accents; digits and punctuation become spaces
    str_squish() %>% # Collapse extra whitespace
    tolower() # Convert to lower case and return
}
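
A quick sanity check of the normalizer on a made-up string (the example text below is ours, not taken from the PDF):

# Accents, digits and punctuation are stripped, and the case is lowered
NormalizaParaTextMining("Gestão de Riscos & Segurança 2021!")
# expected result: "gestao de riscos seguranca"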

Stopword List

# Words to remove: Portuguese stopwords plus single letters
palavrasRemover <- tibble(Palavra = c(stopwords(kind = "pt"), letters)) %>%
  mutate(Palavra = NormalizaParaTextMining(Palavra))

Read PDF

# PDF file to analyse
arquivoPdf <- "~/Text Mining/Frameworkhf.pdf"

Tidying text

TidyT <- arquivoPdf %>% 
  read_pdf() %>% 
  as_tibble() %>% 
  select(text) 
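
pdftools is attached above but not used; if textreadr is unavailable, an equivalent import could look like the sketch below (same file, but one row per page rather than per line; TidyT_paginas is a hypothetical name):

# Alternative read with pdftools: one text element per PDF page
TidyT_paginas <- tibble(text = pdftools::pdf_text(arquivoPdf))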

1 Unigram analysis

Removing stopwords for single-word analysis

CleanW <- TidyT %>% 
  unnest_tokens(Palavra, text) %>% 
  mutate(Palavra = NormalizaParaTextMining(Palavra)) %>% 
  anti_join(palavrasRemover, by = "Palavra")

Counting single words

frequenciaPalavras <- CleanW %>% 
  count(Palavra, sort = TRUE) %>% 
  filter(Palavra != "") %>%
  arrange(desc(n)) 


# Display word frequencies
DT::datatable(frequenciaPalavras)

1.1 Top 15 words

head(frequenciaPalavras, n = 15) %>%
  mutate(Palavra = reorder(Palavra, n)) %>% # order bars by frequency
  ggplot(aes(Palavra, n)) +
  geom_bar(stat = "identity", color = "black", fill = "#87CEFA") +
  geom_text(aes(label = n), hjust = 1.3) + 
  coord_flip() + 
  labs(title = "Top 15 most mentioned words", x = "Words", y = "Number of uses")

barplot(frequenciaPalavras[1:15, 1:2]$n,
        names.arg = frequenciaPalavras[1:15, 1:2]$Palavra,
        ylab = "Count",
        ylim = c(0, max(frequenciaPalavras$n)),
        las = 2,
        col = rainbow(8),
        main = "Top 15")

1.2 Word clouds

# Create the word cloud
wordcloud(
  words = frequenciaPalavras$Palavra, 
  freq = frequenciaPalavras$n,
  min.freq = 2,
  max.words = 300, 
  random.order = FALSE, 
  rot.per = 0.35, 
  colors = brewer.pal(8, "Dark2")
)
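
Word placement in wordcloud() is randomized, so the cloud changes between runs; if a reproducible layout is wanted, a seed can be set immediately before the call above (the seed value is arbitrary):

# Optional: fix the random layout of the word cloud
set.seed(1234)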

2 Bigram analysis

Analysing pairs of words

Removing stopwords and tidying the data for the pairwise word analysis

# remove "www" "http"
my_stopwords <- tibble(Palavra = c(as.character(1:3), 
                                    "www","http", ",", "de","os","nao","da","sao","um","dos","em","para","possui","sobre","das","na","uma","se","não", "outro", "entao", "existem", "outros", "aos", "quando", "seus", "seu", "nas", "na","evitar", "possuem", "ou", "sua", "então", "podem", "esta", "bem", "ha", "há","forem", "documentadas", "documentados", "alta"))

par_Palavras <- TidyT %>%
  unnest_tokens(Palavra, text, token = "ngrams", n = 2) %>% 
  mutate(Palavra = NormalizaParaTextMining(Palavra)) %>%
  anti_join(palavrasRemover, by = "Palavra") %>%
  anti_join(my_stopwords, by = "Palavra") %>%
  separate(Palavra, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word1 %in% my_stopwords$Palavra) %>%
  filter(!word2 %in% my_stopwords$Palavra) %>%
  count(word1, word2, sort = TRUE)

# Remove rows with NA (bigrams that could not be split)
par_Palavras <- na.omit(par_Palavras)


DT::datatable(par_Palavras)
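
If a single bigram column is preferred for display, tidyr::unite() can rejoin the two words (an optional step, not part of the original pipeline; the column name Bigrama is ours):

# Optional: show each bigram as a single column
par_Palavras %>%
  unite(Bigrama, word1, word2, sep = " ") %>%
  head(20)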

2.1 Word graph network

## Function to draw the word network for bigrams with frequency >= z
.Par_net <- function(z) {
  par_Palavras %>%
    filter(n >= z) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
    geom_node_point(color = "darkslategray4", size = 4) +
    geom_node_text(color = "red", aes(label = name), vjust = 1.8, size = 3) +
    labs(title = "Word graph - HF2 Framework",
         subtitle = paste("Pairwise analysis: bigrams with frequency >=", z),
         x = "", y = "")
}

# Note: par(mfrow = ...) has no effect on ggplot2 output; the networks below are drawn as separate plots


.Par_net(10)

.Par_net(5)

.Par_net(3)

3 Trigram analysis

trio_Palavras <- TidyT %>%
  unnest_tokens(Palavra, text, token = "ngrams", n = 3) %>%
  separate(Palavra, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word,
         !word1 %in% my_stopwords$Palavra,
         !word2 %in% my_stopwords$Palavra,
         !word3 %in% my_stopwords$Palavra) %>%
  count(word1, word2, word3, sort = TRUE)

DT::datatable(trio_Palavras)

4 Correlation between pairs of words

We may instead want to examine correlation among words, which indicates how often they appear together relative to how often they appear separately.

In particular, here we’ll focus on the phi coefficient, a common measure for binary correlation. The focus of the phi coefficient is how much more likely it is that either both word X and Y appear, or neither do, than that one appears without the other.

The phi coefficient is equivalent to the Pearson correlation, which you may have heard of elsewhere, when it is applied to binary data.

The pairwise_cor() function in widyr lets us find the phi coefficient between words based on how often they appear in the same section. Its syntax is similar to pairwise_count().
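
For intuition, the phi coefficient can be computed directly from a 2x2 co-occurrence table; the counts below are illustrative only, not taken from this corpus:

# Phi coefficient from a 2x2 co-occurrence table (illustrative counts)
n11 <- 20 # sections containing both word X and word Y
n10 <- 10 # sections with X but not Y
n01 <- 5  # sections with Y but not X
n00 <- 65 # sections with neither word
phi <- (n11 * n00 - n10 * n01) /
  sqrt((n11 + n10) * (n01 + n00) * (n11 + n01) * (n10 + n00))
phi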

library(widyr)
# we need to filter for at least relatively common words first
# Correlation between first words (word1), based on the second words they share;
# keep only word1 values that occur in at least 5 bigrams
word_cor <- par_Palavras %>%
  group_by(word1) %>%
  filter(n() >= 5) %>%
  widyr::pairwise_cor(word1, word2, sort = TRUE)

# Same correlation, but filtering on common second words (word2) instead
word_cor2 <- par_Palavras %>%
  group_by(word2) %>%
  filter(n() >= 5) %>%
  widyr::pairwise_cor(word1, word2, sort = TRUE)

4.1 Correlation barplots of top correlated words

# Words of interest; normalized so they match the accent-stripped tokens in the corpus
Filter_word <- c("trabalho", "condicoes", "organização", "gestao", "acoes", "riscos",
                 "repertório", "competência", "conhecimento", "procedimentos",
                 "desing", "habilidades", "ambiente", "segurança", "técnicas") %>%
  NormalizaParaTextMining()

# Faceted bar chart of the top-z correlations for each word in y
# (despite its name, this function draws bar charts, not a dendrogram)
.Dendogram <- function(x, y, z) {
  x %>%
    filter(item1 %in% y) %>%
    group_by(item1) %>%
    top_n(z, correlation) %>%
    ungroup() %>%
    mutate(item2 = reorder_within(item2, correlation, item1)) %>%
    ggplot(aes(item2, correlation, fill = item1)) +
    geom_col(show.legend = FALSE) +
    labs(x = NULL, y = NULL) +
    facet_wrap(~ item1, ncol = 2, scales = "free") +
    coord_flip() +
    scale_x_reordered()
}

.Dendogram(word_cor, Filter_word, 10)

.Dendogram(word_cor2, Filter_word, 6)

5 Bigram dendrogram

# Cluster the 100 strongest word correlations and plot the dendrogram
palavras <- word_cor
wordnetwork <- head(palavras, 100)
wordnetwork <- graph_from_data_frame(wordnetwork)
wordnetwork2 <- as.undirected(wordnetwork)
comm <- cluster_fast_greedy(wordnetwork2) # unweighted clustering; word_cor has no count column
plot_dendrogram(comm, main = "Pairwise word clusters dendrogram", cex = .9)

# Repeat with the second correlation table, keeping the 40 strongest correlations
palavras <- word_cor2
wordnetwork <- head(palavras, 40)
wordnetwork <- graph_from_data_frame(wordnetwork)
wordnetwork2 <- as.undirected(wordnetwork)
comm <- cluster_fast_greedy(wordnetwork2) # unweighted clustering, as above
plot_dendrogram(comm, main = "Pairwise word clusters dendrogram", cex = .9)

5.1 Correlation network

# Correlations among first words that appear in at least 15 bigrams
word_cor1 <- par_Palavras %>%
  group_by(word1) %>%
  filter(n() >= 15) %>%
  pairwise_cor(word1, word2, sort = TRUE)

word_cor1 %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
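
The full correlation graph can be dense; a common refinement (used in the tidytext book) is to keep only the stronger correlations before plotting. A sketch, with the 0.5 cutoff chosen arbitrarily:

# Keep only stronger correlations to declutter the network
word_cor1 %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()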

6 References

National Commission on the BP Deepwater Horizon Oil Spill and Offshore Drilling (NCDWHSOD). Deep Water: The Gulf Oil Disaster and the Future of Offshore Drilling. Report to the President, January 2011. ISBN: 978-0-16-087371-3. https://www.govinfo.gov/content/pkg/GPO-OILCOMMISSION/pdf/GPO-OILCOMMISSION.pdf

http://data7.blog/grafo-de-palavras-anitta-twitter/

Word frequency and word clouds (UFRGS R wiki, last modified 12 December 2018; content under Creative Commons Attribution-ShareAlike unless otherwise noted): https://www.ufrgs.br/wiki-r/index.php?title=Frequ%C3%AAncia_das_palavras_e_nuvem_de_palavras

https://p4husp.github.io/material/tutorial11/

Main reference: https://www.tidytextmining.com/ngrams.html

Corpus and machine learning: https://rstudio-pubs-static.s3.amazonaws.com/265713_cbef910aee7642dc8b62996e38d2825d.html

Machine learning: https://kenbenoit.net/pdfs/text_analysis_in_R.pdf

Text mining: https://www.rpubs.com/LaionBoaventura/mineracaodetexto