Descriptions

The following is the R code for the functions used to perform the exploratory analysis of the files en_US.twitter.txt, en_US.blogs.txt, and en_US.news.txt.

Functions

library(tidytext)
library(tidyr)
library(sentimentr)
library(dplyr)
library(wordcloud)
library(ggplot2)
library(igraph)
library(ggraph)

## Global state shared by the functions below:
## uniqueProfanityWords accumulates an alternation pattern ("word1|word2|...") used by gsub
uniqueProfanityWords <- "fuck"
## tokenizerData holds the word counts produced by getTokensFromText
tokenizerData <- data.frame()
## bi_grama holds the igraph object produced by create_bigrams
bi_grama <- data.frame()
## dataWithoutProfanityWords holds the cleaned text produced by removeProfanityWords
dataWithoutProfanityWords <- ""

getTokensFromText <- function(pathTofile){
  
  ## read the file and keep a random 70% sample of the lines as the training set
  information <- read.delim(file = pathTofile, header = FALSE)
  sample <- sample(c(TRUE, FALSE), nrow(information), replace = TRUE, prob = c(0.7, 0.3))
  train <- information[sample, ]
  data <- train

  ## call the function that removes profanity words from the text and returns the cleaned text
  dataWithoutProfanityWords <<- removeProfanityWords(data)
  
  ## create a token for each word and count its appearances;
  ## also drop tokens containing digits and words that belong to the stop_words list
  tokensDataFrame <- as.data.frame(dataWithoutProfanityWords) %>%
    unnest_tokens(token = "words", output = "word", input = dataWithoutProfanityWords) %>%
    filter(!grepl('[0-9]', word)) %>%
    anti_join(stop_words, by = join_by(word)) %>%
    count(word, name = "total", sort = TRUE)
  
  tokenizerData <<- tokensDataFrame
  
}
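
Once all of the functions in this section have been sourced, a minimal usage sketch looks like the following; the file path is an assumption, so point it at wherever the corpus files live.

## minimal sketch, assuming en_US.twitter.txt sits in the working directory
getTokensFromText("en_US.twitter.txt")
head(tokenizerData, 10)   ## the ten most frequent non-stop-word tokens
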
removeProfanityWords <- function(data){
  
  ## The next two lines identify the terms that must be removed.
  sentencesFromData <- get_sentences(data)
  profanityInformation <- extract_profanity_terms(sentencesFromData)
  
  ## This step updates the "uniqueProfanityWords" variable;
  ## uniqueProfanityWords becomes the regular expression used with gsub below.
  lapply(profanityInformation$profanity, identifyProfanityWords)
  
  ## Apply the substitution on the original data.
  ## \\b anchors the pattern so that only exact words match.
  match <- paste("\\b", uniqueProfanityWords, "\\b", sep = "")
  newData <- gsub(match, replacement = "*****", x = data)
  
  newData
}
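
A quick sanity check on made-up input; apart from the seed word, which terms get masked depends on sentimentr's default profanity lexicon.

## minimal sketch on invented sentences; flagged words are replaced with "*****"
toy <- c("well fuck this example", "a perfectly clean sentence")
removeProfanityWords(toy)
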
identifyProfanityWords <- function(profanityWordsBySentences){
 
  lapply(profanityWordsBySentences, existsOnUniqueProfanityWords)
 
}
existsOnUniqueProfanityWords <- function(word){
  
  ## append the word to the alternation pattern only if it is not already there
  match <- paste("\\b", word, "\\b", sep = "")
  exist <- grepl(pattern = match, x = uniqueProfanityWords)
  
  if(!exist){
    uniqueProfanityWords <<- paste(uniqueProfanityWords, word, sep = "|")
  }
}
visualizeTwentyMoreFrequentTokens <- function(){
  
  ## tokenizerData is sorted by total, so row 21 gives the cut-off
  ## that keeps only the twenty most frequent tokens
  tokenizerData %>%
    filter(total > tokenizerData[21, 2]) %>%
    mutate(word = reorder(word, total)) %>%
    ggplot(aes(total, word)) + geom_col() + labs(y = NULL)
}
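
With tokenizerData populated by getTokensFromText, the bar chart of the twenty most frequent tokens is produced with:

## minimal sketch; requires tokenizerData to have been filled in by getTokensFromText
visualizeTwentyMoreFrequentTokens()
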
create_bigrams <- function(dataWithoutProfanityWords, biGramsGreaterThan){
  
    cleanDataDF <- as.data.frame(dataWithoutProfanityWords)
    colnames(cleanDataDF) <- c('word')
    
    ## tokenize into bigrams, drop those containing digits or stop words, and count them
    bi_grama_separated <- cleanDataDF %>%
      unnest_tokens('word', token = 'ngrams', n = 2, output = 'bigram') %>%
      filter(!grepl('[0-9]', bigram)) %>%
      separate(bigram, c('word1', 'word2'), sep = " ") %>%
      filter(!(word1 %in% stop_words$word)) %>%
      filter(!(word2 %in% stop_words$word)) %>%
      count(word1, word2, sort = TRUE)
    
    ## keep only bigrams seen more than biGramsGreaterThan times and build the graph
    bi_grama <<- bi_grama_separated %>%
      filter(n > biGramsGreaterThan) %>%
      graph_from_data_frame()
    
    head(bi_grama_separated, 20)
}
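
A minimal call, assuming removeProfanityWords has already produced the cleaned text; the cut-off of 10 is an arbitrary example value, not one fixed by the analysis.

## keeps only bigrams seen more than 10 times when building the graph
create_bigrams(dataWithoutProfanityWords, 10)
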
create_trigrams <- function(dataWithoutProfanityWords){
  
    cleanDataDF <- as.data.frame(dataWithoutProfanityWords)
    colnames(cleanDataDF) <- c('word')
    
    ## tokenize into trigrams, drop those containing digits or stop words, and count them
    ## (stop words are filtered per word after separate(); at this point in the pipeline
    ## there is no "word" column, so an anti_join on stop_words would fail)
    ngramsData <- cleanDataDF %>%
      unnest_tokens('word', token = 'ngrams', n = 3, output = 'trigram') %>%
      filter(!grepl('[0-9]', trigram)) %>%
      separate(trigram, c('word1', 'word2', 'word3'), sep = " ") %>%
      filter(!(word1 %in% stop_words$word)) %>%
      filter(!(word2 %in% stop_words$word)) %>%
      filter(!(word3 %in% stop_words$word)) %>%
      unite(trigram, word1, word2, word3, sep = " ") %>%
      filter(!is.na(trigram)) %>%
      count(trigram, sort = TRUE, name = "total")
    
    head(ngramsData, 20)
}
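
And the corresponding call for trigrams, which prints the twenty most frequent ones:

## minimal sketch; reuses the cleaned text produced by removeProfanityWords
create_trigrams(dataWithoutProfanityWords)
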
create_bigram_plot <- function(bigram_count){
  
  set.seed(2017)
  
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  ## force-directed ("fr") layout; edge transparency encodes the bigram count
  ggraph(bigram_count, layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                   arrow = a, end_cap = circle(.07, 'inches')) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1)
}
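
Once create_bigrams has populated bi_grama, the bigram network is drawn with:

## bi_grama is the igraph object built by create_bigrams above
create_bigram_plot(bi_grama)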