Next is the R code for the functions used to carry out the exploratory analysis of the files en_US.twitter.txt, en_US.blogs.txt, and en_US.news.txt:

```r
library(tidytext)
library(tokenizers)
library(tidyverse)
library(tidyr)
library(sentimentr)
library(dplyr)
library(wordcloud)
library(ggplot2)
library(ggraph)
library(igraph)
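## Global objects populated by the functions below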
allWordsToRemove<-""
tokenizerData<-data.frame()
bi_grama<-data.frame()
bi_grama_2<-""
data <-""
getTokensFromText<-function(pathTofile1,pathTofile2,pathTofile3){
startingLoadingDataDate<-Sys.time()
information1<-read.delim(file = pathTofile1, header = FALSE)
sample1<-sample(c(TRUE, FALSE), nrow(information1), replace=TRUE, prob=c(0.1,0.9))
train1<- information1[sample1,]
information2<-read.delim(file = pathTofile2, header = FALSE)
sample2<-sample(c(TRUE, FALSE), nrow(information2), replace=TRUE, prob=c(0.8,0.2))
train2<- information2[sample2,]
information3<-read.delim(file = pathTofile3, header = FALSE)
sample3<-sample(c(TRUE, FALSE), nrow(information3), replace=TRUE, prob=c(0.2,0.8))
train3<- information3[sample3,]
data<<-c(train1,train2,train3)
## call the function that builds the full removal list (standard stop words plus profanity terms)
allWordsToRemove <<- getAllWordsToRemove(data)
timeLoadingData<-difftime(Sys.time(), startingLoadingDataDate, units = 'min')
print(paste("Time loading data", timeLoadingData, sep = '->'))
}
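## Combine tidytext's standard stop word list with the profanity terms that
## sentimentr detects in the sampled text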
getAllWordsToRemove<-function(data){
## The next two lines identify the profanity terms that must be removed.
sentencesFromData<-get_sentences(data)
profanityDataFrame<-attributes(extract_profanity_terms(sentencesFromData))$elements%>%filter(profanity>0)
uniqueProfanityWords<-unique(profanityDataFrame$words)
customStopWords<-as.data.frame(uniqueProfanityWords)%>%mutate(lexicon='CUSTOM')
colnames(customStopWords)<-c('word','lexicon')
AllStopWords<-rbind(stop_words,customStopWords)
AllStopWords
}
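## Tokenize the sampled text into single words, remove stop/profanity words and
## tokens containing digits, and count word frequencies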
tokenize_data<-function(){
startingTokenization<-Sys.time()
print(" starting tokenization")
tokensDataFrame <- tibble(data)%>%unnest_tokens(token = "words", output = "word", input = data)%>%
anti_join(allWordsToRemove, by = join_by(word))%>%
filter(!grepl('[0-9]', word)) %>%
count(word, name="total", sort = TRUE)
timeTokenization<-difftime( Sys.time(), startingTokenization,units = 'min')
print(paste("Time tokenization", timeTokenization, sep = '->'))
tokenizerData<<-tokensDataFrame
print(head(tokenizerData,30))
}
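## Bar chart of the most frequent tokens (those whose count exceeds the
## 31st-ranked token, i.e. roughly the top 30)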
visualizeFrequentTokens<-function(){
tokenizerData%>%
filter(total>tokenizerData[31,2]$total) %>%
mutate(word = reorder(word, total))%>%
ggplot(aes(total,word)) + geom_col() + labs(y=NULL)
}
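## Build bigram counts from the sampled text, drop bigrams containing digits or
## removed words, keep pairs occurring more than 100 times, and store them as an
## igraph object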
create_bigram_data<-function(){
startingTokenization<-Sys.time()
print(" starting bigram-data")
cleanDataDF <-as.data.frame(data)
colnames(cleanDataDF)<-c('word')
biGramData<-cleanDataDF%>%
unnest_tokens(output = 'bigram', input = 'word', token = 'ngrams', n = 2)%>%
filter(!grepl('[0-9]', bigram)) %>%
separate(bigram, c('word1','word2'), sep=" ")%>%
filter(!(word1 %in% allWordsToRemove$word))%>%
filter(!(word2 %in% allWordsToRemove$word))%>%
count(word1, word2, sort = TRUE)%>%
filter( n>100)
bi_grama<<-biGramData%>%graph_from_data_frame()
timeTokenization<-difftime( Sys.time(), startingTokenization,units = 'min')
print(paste("Time buildin biGram data", timeTokenization, sep = '->'))
}
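## Alternative n-gram tokenization with tokenizers::tokenize_ngrams, keeping
## n-grams from n_min up to n words and skipping the removed words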
create_bigram_data_2<-function(numberGram, minimuNGram){
bi_grama_2 <<- tokenize_ngrams(x = data,
lowercase = TRUE,
n = numberGram,
n_min = minimuNGram,
stopwords = allWordsToRemove$word,
ngram_delim = " ",
simplify = FALSE)
print(head(bi_grama_2,6))
}
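## Draw the bigram network: edge transparency reflects the bigram count and
## arrows indicate word order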
create_bigram_plot<-function(bigram_count){
set.seed(2017)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_count, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
}
```
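Finally, a minimal sketch of how these functions might be chained together, assuming the three corpus files sit in the working directory; the file names below are placeholders and should point at your local copies, in whatever order matches `pathTofile1` to `pathTofile3`.

```r
## Example call sequence (file paths are placeholders for the local copies).
getTokensFromText("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt")

tokenize_data()              # fills tokenizerData with word counts
visualizeFrequentTokens()    # bar chart of the most frequent words

create_bigram_data()         # fills bi_grama with an igraph of frequent bigrams
create_bigram_plot(bi_grama) # network plot of those bigrams

create_bigram_data_2(3, 2)   # 2- and 3-grams via tokenizers::tokenize_ngrams
```

Because the line sampling in `getTokensFromText` is random, the counts and plots will differ slightly between runs unless a seed is set beforehand with `set.seed()`.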