Descriptions

Next is the R code for the functions used to perform the exploratory analysis of the files en_US.twitter.txt, en_US.blogs.txt, and en_US.news.txt.

Functions

library(tidytext)    # unnest_tokens(), stop_words
library(tokenizers)  # tokenize_ngrams()
library(tidyverse)   # attaches dplyr, tidyr, ggplot2, tibble, ...
library(sentimentr)  # get_sentences(), extract_profanity_terms()
library(wordcloud)   # wordcloud()
library(ggraph)      # ggraph(), geom_edge_link(), geom_node_*()
library(igraph)      # graph_from_data_frame()

## Globals filled in by the functions below (via <<-)
allWordsToRemove <- ""           # stop words plus profanity terms to filter out
tokenizerData    <- data.frame() # word-frequency table
bi_grama         <- data.frame() # igraph object built from the bigram counts
bi_grama_2       <- ""           # n-gram lists produced by tokenize_ngrams()
data             <- ""           # sampled corpus lines

getTokensFromText <- function(pathTofile1, pathTofile2, pathTofile3){
  
  startingLoadingDataDate <- Sys.time()
  
  ## Read each file and keep a random sample of its lines
  ## (~10% of the first file, ~80% of the second, ~20% of the third).
  information1 <- read.delim(file = pathTofile1, header = FALSE)
  sample1 <- sample(c(TRUE, FALSE), nrow(information1), replace = TRUE, prob = c(0.1, 0.9))
  train1 <- information1[sample1, ]
  
  information2 <- read.delim(file = pathTofile2, header = FALSE)
  sample2 <- sample(c(TRUE, FALSE), nrow(information2), replace = TRUE, prob = c(0.8, 0.2))
  train2 <- information2[sample2, ]
  
  information3 <- read.delim(file = pathTofile3, header = FALSE)
  sample3 <- sample(c(TRUE, FALSE), nrow(information3), replace = TRUE, prob = c(0.2, 0.8))
  train3 <- information3[sample3, ]
  
  data <<- c(train1, train2, train3)
  
  ## Call the function that identifies profanity words in the text.
  allWordsToRemove <<- getAllWordsToRemove(data)
  
  timeLoadingData <- difftime(Sys.time(), startingLoadingDataDate, units = 'min')
  print(paste("Time loading data", timeLoadingData, sep = '->'))
}
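
A minimal usage sketch; the seed and the ./data/ paths below are assumptions about the local layout, not part of the original script:

set.seed(1234)  # hypothetical seed; makes the random line sampling reproducible
getTokensFromText("./data/en_US.twitter.txt",
                  "./data/en_US.blogs.txt",
                  "./data/en_US.news.txt")
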
getAllWordsToRemove <- function(data){
  
  ## The next lines identify the profanity terms that must be removed.
  sentencesFromData <- get_sentences(data)
  profanityDataFrame <- attributes(extract_profanity_terms(sentencesFromData))$elements %>%
    filter(profanity > 0)
  uniqueProfanityWords <- unique(profanityDataFrame$words)
  
  ## Combine the profanity terms with the standard tidytext stop words.
  customStopWords <- as.data.frame(uniqueProfanityWords) %>% mutate(lexicon = 'CUSTOM')
  colnames(customStopWords) <- c('word', 'lexicon')
  AllStopWords <- rbind(stop_words, customStopWords)
  
  AllStopWords
}
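
A quick sanity check on toy input; whether "damn" is actually flagged depends on sentimentr's default profanity lexicon, so treat the result as illustrative:

toyStopWords <- getAllWordsToRemove(c("What a damn mess.", "Hello world."))
tail(toyStopWords)  # rows with lexicon == 'CUSTOM' are the flagged terms
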
tokenize_data <- function(){
  
  startingTokenization <- Sys.time()
  print("starting tokenization")
  
  ## Split the sampled corpus into single words, drop stop/profanity
  ## words and tokens containing digits, then count word frequencies.
  tokensDataFrame <- tibble(data) %>%
    unnest_tokens(token = "words", output = "word", input = data) %>%
    anti_join(allWordsToRemove, by = join_by(word)) %>%
    filter(!grepl('[0-9]', word)) %>%
    count(word, name = "total", sort = TRUE)
  
  timeTokenization <- difftime(Sys.time(), startingTokenization, units = 'min')
  print(paste("Time tokenization", timeTokenization, sep = '->'))
  
  tokenizerData <<- tokensDataFrame
  
  print(head(tokenizerData, 30))
}
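
tokenize_data() reads the globals set by getTokensFromText(), so it must run afterwards; once it has, the frequency table can be inspected:

tokenize_data()
nrow(tokenizerData)        # vocabulary size after filtering
sum(tokenizerData$total)   # total number of tokens kept
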
visualizeFrequentTokens <- function(){
  
  ## Plot the words whose counts exceed the 31st-largest total,
  ## i.e. roughly the 30 most frequent words.
  tokenizerData %>%
    filter(total > tokenizerData[31, 2]$total) %>%
    mutate(word = reorder(word, total)) %>%
    ggplot(aes(total, word)) + geom_col() + labs(y = NULL)
}
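
It takes no arguments and reads the global tokenizerData, so it is simply called after tokenize_data():

visualizeFrequentTokens()
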
create_bigram_data <- function(){
  
  startingTokenization <- Sys.time()
  print("starting bigram data")
  
  cleanDataDF <- as.data.frame(data)
  colnames(cleanDataDF) <- c('word')
  
  ## Build bigrams, drop tokens containing digits or filtered words,
  ## count the pairs, and keep only those seen more than 100 times.
  biGramData <- cleanDataDF %>%
    unnest_tokens(output = 'bigram', input = 'word', token = 'ngrams', n = 2) %>%
    filter(!grepl('[0-9]', bigram)) %>%
    separate(bigram, c('word1', 'word2'), sep = " ") %>%
    filter(!(word1 %in% allWordsToRemove$word)) %>%
    filter(!(word2 %in% allWordsToRemove$word)) %>%
    count(word1, word2, sort = TRUE) %>%
    filter(n > 100)
  
  ## Convert the bigram counts into an igraph graph for plotting.
  bi_grama <<- biGramData %>% graph_from_data_frame()
  
  timeTokenization <- difftime(Sys.time(), startingTokenization, units = 'min')
  print(paste("Time building biGram data", timeTokenization, sep = '->'))
}
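
A sketch of running it and checking the size of the resulting network:

create_bigram_data()
igraph::vcount(bi_grama)  # number of distinct words in the bigram network
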
create_bigram_data_2 <- function(numberGram, minimumNGram){
  
  ## Alternative n-gram builder using tokenizers::tokenize_ngrams(),
  ## which removes the stop/profanity words while tokenizing.
  bi_grama_2 <<- tokenize_ngrams(x = data,
                                 lowercase = TRUE,
                                 n = numberGram,
                                 n_min = minimumNGram,
                                 stopwords = allWordsToRemove$word,
                                 ngram_delim = " ",
                                 simplify = FALSE)
  
  print(head(bi_grama_2, 6))
}
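
For plain bigrams both arguments are set to 2; the head printed inside the function shows the first few tokenized entries:

create_bigram_data_2(2, 2)
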
create_bigram_plot <- function(bigram_count){
  
  set.seed(2017)  # fix the force-directed layout for reproducibility
  
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  ## Draw the bigram network: edge transparency reflects the pair count.
  ggraph(bigram_count, layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                   arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1)
}
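
The final plotting step takes the igraph object built by create_bigram_data():

create_bigram_plot(bi_grama)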