1 Context of study

Accidents Report Text Mining Analysis

Describe the project context and how it helps shape the indicators.

2 Study goal

The purpose of this study is to discover knowledge about the causes of disasters in the oil and gas industry, in order to support the establishment of indicators in the Human Factors II Project (Libra Consortium, Brazil, 2007 - 2022).

2.1 Research question

Which indicators can we extract from the Gulf Oil Disaster Report?

3 Methods, Materials and Techniques

Describe the method once the analyses are complete.

3.1 Materials

Report of the National Commission on the BP Deepwater Horizon Oil Spill and Offshore Drilling (NCDWHSOD): Deep Water: The Gulf Oil Disaster and the Future of Offshore Drilling. Report to the President, January 2011. ISBN 978-0-16-087371-3. https://www.govinfo.gov/content/pkg/GPO-OILCOMMISSION/pdf/GPO-OILCOMMISSION.pdf
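
To reproduce the analysis, the report PDF can be downloaded directly from govinfo.gov into the local path used in the Read PDF step below; a minimal sketch:

# Download the report to the local path used later in this analysis
download.file(
  url      = "https://www.govinfo.gov/content/pkg/GPO-OILCOMMISSION/pdf/GPO-OILCOMMISSION.pdf",
  destfile = "~/Text Mining/offshore-accident.pdf",
  mode     = "wb"  # write in binary mode so the PDF is not corrupted
)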

3.2 Techniques

1. Knowledge Discovery in Text (KDT) techniques using n-gram analysis (see the toy example below):
   - Unigrams
   - Bigrams
   - Trigrams
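
As a quick illustration of what unnest_tokens() produces for each n, here is a toy sketch on a made-up sentence (not drawn from the report data):

# Toy example: unigrams, bigrams and trigrams from a single sentence
library(tidytext)
library(dplyr)

toy <- tibble(text = "the blowout preventer failed to seal the well")

toy %>% unnest_tokens(Palavra, text, token = "ngrams", n = 1)  # unigrams
toy %>% unnest_tokens(Palavra, text, token = "ngrams", n = 2)  # bigrams
toy %>% unnest_tokens(Palavra, text, token = "ngrams", n = 3)  # trigrams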

R Packages

# Packages and functions
library(tidyverse)    # Data manipulation (dplyr, ggplot2, stringr, ...)
library(tidytext)     # Tidy text manipulation (unnest_tokens, stop_words)
library(textreadr)    # Read PDF files into text
library(tm)           # Text mining package, used here for its English stopwords
library(wordcloud)    # Word cloud plots
library(RColorBrewer) # Color palettes (brewer.pal) used in the word cloud
library(igraph)       # Graph objects and community detection
library(ggraph)       # Grammar-of-graphics network plots
library(ggplot2)      # Plotting (also loaded by tidyverse)
library(dplyr)        # Data manipulation (also loaded by tidyverse)
library(pdftools)     # PDF text extraction
library(RRPP)         # Randomized residual permutation procedures (not used in the analysis below)

Text Normalization Function

# Function to normalize text for text mining
NormalizaParaTextMining <- function(texto){
 
  # Normalize the text
  texto %>% 
    chartr(
      old = "áéíóúÁÉÍÓÚýÝàèìòùÀÈÌÒÙâêîôûÂÊÎÔÛãõÃÕñÑäëïöüÄËÏÖÜÿçÇ´`^~¨:.!?&$@#0123456789",
      new = "aeiouAEIOUyYaeiouAEIOUaeiouAEIOUaoAOnNaeiouAEIOUycC                       ",
      x = .) %>%       # Replace accented characters; map punctuation and digits to spaces
    str_squish() %>%   # Remove excess whitespace
    tolower() %>%      # Convert to lower case
    return()           # Return the normalized text
}
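
A quick sanity check of the function on a made-up string (illustrative only):

# Symbols are stripped, digits become spaces, and the result is lower-cased
NormalizaParaTextMining("Deepwater Horizon: 11 fatalities & 87 days!")
## Expected output: "deepwater horizon fatalities days"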

Stopword List

# Words to be removed: English stopwords (from tm) plus single letters
palavrasRemover <- tibble(Palavra = c(stopwords(kind = "en"), letters)) %>%
  mutate(Palavra = NormalizaParaTextMining(Palavra))

Read PDF

# Path to the PDF file
arquivoPdf <- "~/Text Mining/offshore-accident.pdf"

Tidying the text

TidyT <- arquivoPdf %>% 
  read_pdf() %>%    # Extract the PDF text with textreadr
  as_tibble() %>% 
  select(text)      # Keep only the text column

4 Unigram analysis

Removing stopwords for the single-word analysis

CleanW <- TidyT %>% 
  unnest_tokens(Palavra, text) %>%                        # One word per row
  mutate(Palavra = NormalizaParaTextMining(Palavra)) %>%  # Normalize each word
  anti_join(palavrasRemover)                              # Drop stopwords and single letters
## Joining, by = "Palavra"

Counting single words

frequenciaPalavras <- CleanW %>% 
  count(Palavra, sort = TRUE) %>%  # Count occurrences of each word
  filter(Palavra != "") %>%        # Drop empty strings left by normalization
  arrange(desc(n)) 

# View the word frequency table
DT::datatable(frequenciaPalavras)

4.1 Fault tree

(Figure: fault tree example.)

4.2 Top 15 words

Top_15 <- arrange(frequenciaPalavras[1:15, 1:2], desc(n)) 

# Plot the 15 most frequent words, ordered by count
Top_15 %>%
  ggplot(aes(x = reorder(Palavra, n), y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(y = "Count",
       x = "Unique words",
       title = "Most frequent unique words in the report")

4.3 Word clouds

# Build the word cloud (words with frequency >= 2, at most 300 words)
wordcloud(
  words = frequenciaPalavras$Palavra, 
  freq = frequenciaPalavras$n,
  min.freq = 2,
  max.words = 300, 
  random.order = FALSE, 
  rot.per = 0.35, 
  colors = brewer.pal(8, "Dark2")
)

5 Bigram analysis

Analyzing pairs of consecutive words (bigrams)

Removing stopwords and tidying the data for the bigram analysis

# Extra stopwords: digits 1-3, "www", "http" and stray commas
my_stopwords <- tibble(Palavra = c(as.character(1:3), 
                                   "www", "http", ","))

par_Palavras <- TidyT %>%
  unnest_tokens(Palavra, text, token = "ngrams", n = 2) %>%  # One bigram per row
  mutate(Palavra = NormalizaParaTextMining(Palavra)) %>%     # Normalize each bigram
  anti_join(palavrasRemover) %>%
  anti_join(my_stopwords) %>%
  separate(Palavra, c("word1", "word2"), sep = " ") %>%      # Split the bigram into two columns
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word1 %in% my_stopwords$Palavra) %>%
  filter(!word2 %in% my_stopwords$Palavra) %>%
  count(word1, word2, sort = TRUE)
## Joining, by = "Palavra"
## Joining, by = "Palavra"

# Remove bigrams with missing words
par_Palavras <- na.omit(par_Palavras)


DT::datatable(par_Palavras)

5.1 Word graph network

## Function to plot the bigram network for all pairs with frequency >= z
.Par_net <- function(z) {
  par_Palavras %>%
    filter(n >= z) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
    geom_node_point(color = "darkslategray4", size = 4) +
    geom_node_text(color = "red", aes(label = name), vjust = 1.8, size = 3) +
    labs(title = "Word graph - The Gulf Oil Disaster Report",
         subtitle = paste("Pairwise analysis, bigram frequency >=", z),
         x = "", y = "")
}

# Note: par(mfrow = c(2, 2)) has no effect on ggplot2/ggraph objects,
# so the four networks below are drawn as separate plots.

.Par_net(7)

.Par_net(6)

.Par_net(5)

.Par_net(3)

6 Trigram analysis

trio_Palavras <- TidyT %>%
  unnest_tokens(Palavra, text, token = "ngrams", n = 3) %>%       # One trigram per row
  separate(Palavra, c("word1", "word2", "word3"), sep = " ") %>%  # Split into three columns
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

DT::datatable(trio_Palavras)

7 Correlation between pairs of words

We may instead want to examine correlation among words, which indicates how often they appear together relative to how often they appear separately.

In particular, here we’ll focus on the phi coefficient, a common measure for binary correlation. The focus of the phi coefficient is how much more likely it is that either both word X and Y appear, or neither do, than that one appears without the other.

The phi coefficient is equivalent to the Pearson correlation, which you may have heard of elsewhere, applied to binary data.
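
Concretely, if $n_{11}$ is the number of text units where both word X and word Y appear, $n_{00}$ the number where neither appears, $n_{10}$ and $n_{01}$ the numbers where only one of them appears, and $n_{1\cdot}$, $n_{0\cdot}$, $n_{\cdot 1}$, $n_{\cdot 0}$ the corresponding row and column totals, then

$$\phi = \frac{n_{11}\,n_{00} - n_{10}\,n_{01}}{\sqrt{n_{1\cdot}\; n_{0\cdot}\; n_{\cdot 1}\; n_{\cdot 0}}}$$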

The pairwise_cor() function in widyr computes the phi coefficient between words based on how often they appear together in the same unit of text; here it is applied to the bigram table, correlating the first words of the bigrams across the second words. Its syntax is similar to that of pairwise_count(), illustrated in the sketch below.
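
For comparison, a minimal sketch of pairwise_count() on this corpus, assuming the PDF is re-read with pdftools so that each page serves as the co-occurrence unit (the object names paginas, pagesTidy and par_contagem are only illustrative):

# Co-occurrence counts per page with widyr::pairwise_count()
paginas <- pdftools::pdf_text(arquivoPdf)  # one character string per page

pagesTidy <- tibble(pagina = seq_along(paginas), text = paginas) %>%
  unnest_tokens(Palavra, text) %>%                   # one word per row, keeping the page id
  mutate(Palavra = NormalizaParaTextMining(Palavra)) %>%
  anti_join(palavrasRemover) %>%
  filter(Palavra != "")

par_contagem <- pagesTidy %>%
  widyr::pairwise_count(Palavra, pagina, sort = TRUE)  # how often two words share a page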

library(widyr)
# we need to filter for at least relatively common words first
word_cor <- par_Palavras %>%
  group_by(word1) %>%
  filter(n() >= 5) %>%
  widyr::pairwise_cor(word1,word2, sort = TRUE)

7.1 Correlation bar plots for selected words

word_cor %>%
  filter(item1 %in% c("oil", "spill", "deepwater", "response", "safety", "environmental", "bp")) %>%
  group_by(item1) %>%
  top_n(12, correlation) %>%                               # 12 most correlated words per panel
  ungroup() %>%
  mutate(item2 = reorder_within(item2, correlation, item1)) %>%
  ggplot(aes(item2, correlation, fill = item1)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~ item1, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()

8 Dendrogram of bigrams

# Build an undirected graph from the 46 most correlated word pairs
palavras <- word_cor
wordnetwork <- head(palavras, 46)
wordnetwork <- graph_from_data_frame(wordnetwork)
wordnetwork2 <- as.undirected(wordnetwork, edge.attr.comb = "mean")  # average duplicated edge attributes

# Greedy community detection, weighted by the pairwise correlation
comm <- cluster_fast_greedy(wordnetwork2, weights = E(wordnetwork2)$correlation)
plot_dendrogram(comm, main = "Pairwise word clusters dendrogram", cex = .9)

8.1 Correlation network

word_cor1 <- par_Palavras %>%
  group_by(word1) %>%
  filter(n() >= 6) %>%
  pairwise_cor(word1,word2, sort = TRUE)

word_cor1 %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

9 References

National Commission on the BP Deepwater Horizon Oil Spill and Offshore Drilling (NCDWHSOD). Deep Water: The Gulf Oil Disaster and the Future of Offshore Drilling. Report to the President, January 2011. ISBN 978-0-16-087371-3. https://www.govinfo.gov/content/pkg/GPO-OILCOMMISSION/pdf/GPO-OILCOMMISSION.pdf

http://data7.blog/grafo-de-palavras-anitta-twitter/

Word analysis. UFRGS Wiki-R. Available at: https://www.ufrgs.br/wiki-r/index.php?title=Frequ%C3%AAncia_das_palavras_e_nuvem_de_palavras. Last modified 12 December 2018, 19:30. Content available under Creative Commons Attribution-ShareAlike unless otherwise noted.

https://p4husp.github.io/material/tutorial11/

Main reference: https://www.tidytextmining.com/ngrams.html

Corpus and Machine Learning: https://rstudio-pubs-static.s3.amazonaws.com/265713_cbef910aee7642dc8b62996e38d2825d.html

Machine learning: https://kenbenoit.net/pdfs/text_analysis_in_R.pdf

Text mining: https://www.rpubs.com/LaionBoaventura/mineracaodetexto

Clustering in text mining: https://cran.r-project.org/web/packages/textmineR/vignettes/b_document_clustering.html

https://www.r-bloggers.com/clusters-of-texts/

https://gist.github.com/luccitan/b74c53adfe3b6dad1764af1cdc1f08b7