The first step in building a predictive model for text is understanding the distribution of, and the relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare for building the first linguistic models.
Creating functions for graphs and word clouds
First, I create the functions needed to build the frequency graphs and word clouds.
# Building the unigram tokenizer function; I also remove profanity (UNLIST_PROFA) and English stopwords.
unigram <- function(Sample){
  Texte1 <- tokens(Sample, what = "word", remove_punct = TRUE, remove_symbols = TRUE,
                   remove_numbers = TRUE, remove_url = TRUE, remove_separators = TRUE,
                   split_hyphens = FALSE)
  Texte1 <- tokens_tolower(Texte1)
  Texte1 <- tokens_select(Texte1, stopwords("en"), selection = "remove")
  Texte1 <- tokens_select(Texte1, UNLIST_PROFA, selection = "remove")
  texteunigram <- tokens_ngrams(Texte1, n = 1)
  return(texteunigram)
}
# Building the bigram tokenizer function; I also remove profanity (UNLIST_PROFA) and English stopwords.
bigram <- function(Sample){
  Texte1 <- tokens(Sample, what = "word", remove_punct = TRUE, remove_symbols = TRUE,
                   remove_numbers = TRUE, remove_url = TRUE, remove_separators = TRUE,
                   split_hyphens = FALSE)
  Texte1 <- tokens_tolower(Texte1)
  Texte1 <- tokens_select(Texte1, stopwords("en"), selection = "remove")
  Texte1 <- tokens_select(Texte1, UNLIST_PROFA, selection = "remove")
  textebigram <- tokens_ngrams(Texte1, n = 2)
  return(textebigram)
}
# Building the trigram tokenizer function; I also remove profanity (UNLIST_PROFA) and English stopwords.
trigram <- function(Sample){
  Texte1 <- tokens(Sample, what = "word", remove_punct = TRUE, remove_symbols = TRUE,
                   remove_numbers = TRUE, remove_url = TRUE, remove_separators = TRUE,
                   split_hyphens = FALSE)
  Texte1 <- tokens_tolower(Texte1)
  Texte1 <- tokens_select(Texte1, stopwords("en"), selection = "remove")
  Texte1 <- tokens_select(Texte1, UNLIST_PROFA, selection = "remove")
  textetrigram <- tokens_ngrams(Texte1, n = 3)
  return(textetrigram)
}
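To show how these tokenizer functions behave, here is a minimal sketch on a toy character vector. It assumes quanteda is attached and that the UNLIST_PROFA profanity vector from the earlier setup is available; the toy sentences and the example tokens in the comments are only illustrative.
# Toy check of the tokenizer functions (illustrative only; assumes quanteda
# and the UNLIST_PROFA vector from the earlier setup are available).
toySample <- c("The quick brown fox jumps over the lazy dog!",
               "Check http://example.com for 100 more examples.")
head(unigram(toySample)[[1]])  # e.g. "quick" "brown" "fox" "jumps" "lazy" "dog"
head(bigram(toySample)[[1]])   # e.g. "quick_brown" "brown_fox" "fox_jumps" ...
head(trigram(toySample)[[1]])  # e.g. "quick_brown_fox" "brown_fox_jumps" ...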
# Reading the list of over-represented words collected in earlier trials
# (warn = FALSE because txt.txt has no final newline).
repetWords <- readLines("txt.txt", warn = FALSE)
repetWords <- str_split(repetWords, " ")
unlistrep <- unlist(repetWords)
# "lol" and "rt" dominate the Twitter top five, so I exclude them to give other words a chance.
repeated_words <- c(unlistrep, "p.m", "lol", "rt")
repetbiWords <- readLines("txt_.txt", warn = FALSE)
repetbiWords <- str_split(repetbiWords, " ")
unlistrepbi <- unlist(repetbiWords)
# I chose these bigrams after some frequency trials, to keep stopword-heavy pairs out of the top lists.
repeated_biwords <- c(unlistrepbi, "p.m", "even_though", "can_get", "can_make", "year_old", "one_day",
                      "can_see", "feel_like", "can_find", "make_sure", "just_get", "just_like", "every_day")
# Creating the top-5 frequencies for the unigram model (function)
unig <- function(sample){
  SampleBB_uni <- unigram(sample)
  uniDfm <- dfm(SampleBB_uni, tolower = TRUE, remove = repeated_words)
  topUnigram <- textstat_frequency(uniDfm, n = 5)
  return(topUnigram)
}
# Creating the top-5 frequencies for the bigram model (function)
big <- function(sample){
  SampleBB_bi <- bigram(sample)
  biDfm <- dfm(SampleBB_bi, tolower = TRUE, remove = repeated_biwords)
  topBigram <- textstat_frequency(biDfm, n = 5)
  return(topBigram)
}
# Creating the top-5 frequencies for the trigram model (function)
trig <- function(sample){
  SampleBB_tri <- trigram(sample)
  triDfm <- dfm(SampleBB_tri, tolower = TRUE, remove = repeated_biwords)
  topTrigram <- textstat_frequency(triDfm, n = 5)
  return(topTrigram)
}
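Before plotting, it helps to see what these helpers return. The sketch below uses a made-up toy string and is only illustrative (the exact rows depend on the repeated_words list read above): textstat_frequency() gives a data frame with feature, frequency, rank, docfreq, and group columns, and the graph functions below use the feature and frequency columns directly.
# Illustrative call: unig() returns a data frame (feature, frequency, rank,
# docfreq, group) with the most frequent unigrams after filtering.
# The toy text is an assumption; real results depend on txt.txt.
toyTop <- unig(c("coffee coffee coffee sunshine sunshine breakfast"))
print(toyTop)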
# Creating the frequency graph for the unigram model
graphsFreqUni <- function(sampled__){
  UniBlogs <- unig(sampled__)
  UniBlogs$feature <- with(UniBlogs, reorder(feature, -frequency))
  uniPlot <- ggplot(UniBlogs, aes(x = feature, y = frequency)) +
    geom_point() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  return(uniPlot)
}
# Creating the frequency graph for the bigram model
graphsFreqBi <- function(sampled__){
  BiBlogs <- big(sampled__)
  BiBlogs$feature <- with(BiBlogs, reorder(feature, -frequency))
  biPlot <- ggplot(BiBlogs, aes(x = feature, y = frequency)) +
    geom_point() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  return(biPlot)
}
# Creating the frequency graph for the trigram model
graphsFreqtri <- function(sampled__){
  TriBlogs <- trig(sampled__)
  TriBlogs$feature <- with(TriBlogs, reorder(feature, -frequency))
  triPlot <- ggplot(TriBlogs, aes(x = feature, y = frequency)) +
    geom_point() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  return(triPlot)
}
# Creating the word-cloud function for the unigram model
cloudesplotUni <- function(sample){
  SampleBB_uni <- unigram(sample)
  textplot_wordcloud(dfm(SampleBB_uni, tolower = TRUE, remove = repeated_words),
                     max_words = 20, color = rev(RColorBrewer::brewer.pal(4, "RdBu")),
                     random_order = FALSE, random_color = TRUE, fixed_aspect = TRUE)
}
# Creating the word-cloud function for the bigram model
cloudesplotbi <- function(sample){
  SampleBB_bi <- bigram(sample)
  textplot_wordcloud(dfm(SampleBB_bi, tolower = TRUE, remove = repeated_biwords),
                     max_words = 20, color = rev(RColorBrewer::brewer.pal(4, "RdBu")),
                     random_order = FALSE, random_color = TRUE, fixed_aspect = TRUE)
}
# Creating the word-cloud function for the trigram model
cloudesplottri <- function(sample){
  SampleBB_tri <- trigram(sample)
  textplot_wordcloud(dfm(SampleBB_tri, tolower = TRUE, remove = repeated_biwords),
                     max_words = 20, color = rev(RColorBrewer::brewer.pal(4, "RdBu")),
                     random_order = FALSE, random_color = TRUE, fixed_aspect = TRUE)
}
Next, I plot the graphs for the unigrams, bigrams, and trigrams, considering probabilities of 90% and 50%.
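As a sketch of how the helpers above are called, the lines below use sampleBlogs90 and sampleBlogs50 as placeholder names for the text samples drawn earlier with those probabilities; the actual object names come from the sampling step earlier in the report.
# Hypothetical calls (sampleBlogs90 / sampleBlogs50 are placeholder names for
# the 90%- and 50%-probability samples created earlier in the report).
graphsFreqUni(sampleBlogs90)   # top-5 unigram frequency plot
graphsFreqBi(sampleBlogs90)    # top-5 bigram frequency plot
graphsFreqtri(sampleBlogs50)   # top-5 trigram frequency plot
cloudesplotUni(sampleBlogs90)  # unigram word cloud
cloudesplotbi(sampleBlogs50)   # bigram word cloud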