# Text Mining Package
library(tm)
## Loading required package: NLP
# Plotting Package
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Quantitative Analysis for text data
library(quanteda)
## quanteda version 1.0.0
## Using 3 of 4 threads for parallel computing
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
The file is obtained from the URL provided and unzipped, and the working directory is set to the en_US folder.
It is faster and easier to perform some essential cleanup of the data using operating-system commands, so that only clean data is brought into R.
System commands are run to remove apostrophes, strip numbers followed by letters (like 2nd, 20th, etc.), clean out the remaining numbers, and finally remove URLs from the working data set.
# Run the following commands in the terminal window
# change directory to the path ~/data/final/en_US
# remove all apostrophes
sed -i "s/'//g" *.*
# remove all numbers followed by characters
sed -Ei "s/[0-9]+[Aa-Zz]+//g" *.*
# remove all numbers
sed -Ei "s/[0-9]+//g" *.*
# remove all URLs
sed -Ei 's!https?://\S*!!g' *.*
setwd("~/data/final/en_US")
blogs <- readLines("en_US.blogs.txt")
news <- readLines("en_US.news.txt")
twitter <- readLines("en_US.twitter.txt")
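A quick optional check (not part of the original report) of whether the shell cleanup removed digits and URLs before going further:
# count lines that still contain digits or URLs after the sed cleanup
sum(grepl("[0-9]", blogs))
sum(grepl("http", blogs, fixed = TRUE))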
The data sets are quite large, as we can see from the following summaries. This calls for sampling, so that we can efficiently perform some analytics on a smaller subset of the data.
summary(blogs)
## the 10 most common values are:
##                                                            146
##   --                                                       100
##   Article                                                   78
##   :                                                         45
##   Acts :                                                    39
##   //                                                        38
##   -                                                         32
##   April ,                                                   32
##   Brewed: --                                                31
##   Level : MP Cost: , Damage: %, Attacks up to enemies.      31
summary(news)
## the 10 most common values are:
##   May , : PM EDT                                            181
##                                                             140
##   --                                                        135
##   Per serving: calories, g protein, g carbohydrate, g fat ( g saturated), mg cholesterol, mg sodium, g fiber.   107
##   () -                                                       98
##   (-) : PDT WASHINGTON (AP) --                                69
##   Updated at : p.m.                                           67
##   (-) : PDT WASHINGTON, (AP) --                               66
##   May , : AM EDT                                              65
##   Academic rank: of                                           63
summary(twitter)
## the 10 most common values are:
##   Thanks for the RT!         571
##   Thank you!                 547
##   thank you!                 382
##   Thanks for the follow!     327
##   Thanks for the mention!    188
##   thanks for the RT!         185
##   thank you                  183
##   thanks for the follow!     178
##   thank you :)               159
##   Youre welcome!             136
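For a more direct measure of scale than the frequent values above, the line counts and approximate in-memory sizes can also be checked (a quick sketch using the objects already loaded; output not shown):
# number of lines in each data set
length(blogs); length(news); length(twitter)
# approximate memory footprint of each data set
format(object.size(blogs), units = "MB")
format(object.size(news), units = "MB")
format(object.size(twitter), units = "MB")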
The correct way to size the sample is to find the number of distinct words “p” and the average number of words per line “l”; the number of lines needed for a good model would then be roughly 10p/l.
Since we are not at the stage of finding the terms yet, we will simply take about 15% of each document for our analysis.
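As a rough illustration of the 10p/l heuristic (not part of the original analysis; the 10,000-line probe and the simple whitespace tokenization are assumptions), p and l could be estimated from a small probe of the blogs data:
# estimate p (distinct words) and l (words per line) from a small probe of the blogs data
probe <- sample(blogs, 10000)
probe.words <- unlist(strsplit(probe, "\\s+"))
p <- length(unique(tolower(probe.words)))    # distinct words seen in the probe
l <- length(probe.words) / length(probe)     # average number of words per line
ceiling(10 * p / l)                          # approximate number of lines suggested by the heuristic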
Since sampling is a random function, we will set the seed for reproducibility.
set.seed(54321)
sample.blogs <- sample(blogs,size = length(blogs)*0.15,replace = FALSE)
sample.news <- sample(news,size = length(news)*0.15,replace = FALSE)
sample.tweets <- sample(twitter,size = length(twitter)*0.15,replace = FALSE)
# save sample blogs
write(sample.blogs,file = "~/data/sample/sample_blogs.txt")
# save sample news
write(sample.news,file = "~/data/sample/sample_news.txt")
# save sample tweets
write(sample.tweets,file = "~/data/sample/sample_tweets.txt")
# Cleanup
rm("blogs","news","twitter","sample.blogs","sample.news","sample.tweets")
# garbage collection
gc()
##            used  (Mb)  gc trigger  (Mb)  max used   (Mb)
## Ncells  1459356  78.0     6619081 353.5   6562027  350.5
## Vcells  5582838  42.6   109327496 834.2 136651497 1042.6
In this step, we form a corpus from the documents we have, and then apply some cleaning steps that will make the corpus easier to tokenize.
# build a corpus of the sampled documents (tm VCorpus)
en.corpora <- VCorpus(DirSource('~/data/sample/'))
# convert it into a quanteda corpus
en.corpus <- corpus(en.corpora)
# convert to tokens by removing numbers, punctuation, symbols and hyphens
en.tokens <- tokens(en.corpus, what = "word", remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove_hyphens = TRUE)
# convert the tokens into lowercase
en.tokens <- tokens_tolower(en.tokens)
# remove stop words (very common words like "the", "an", "a", etc.)
en.tokens <- tokens_select(en.tokens, stopwords("english"),selection = "remove")
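For reference, the first few entries of the English stop word list used above look like this (an illustrative peek, not part of the original output):
head(stopwords("english"), 10)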
# remove profanity from the list obtained from github
# https://github.com/words/profanities
profanity <- readLines("~/data/final/en_US/gitbadlist.txt")
en.tokens <- tokens_select(en.tokens, profanity, selection = "remove")
# garbage collection
gc()
##            used  (Mb)  gc trigger  (Mb)  max used   (Mb)
## Ncells  1719927  91.9     5295264 282.8   6619081  353.5
## Vcells 21093091 161.0   105018396 801.3 136651497 1042.6
# generate Unigrams
en.unigram <- tokens_ngrams(en.tokens,1)
# generate Bigrams
en.bigram <- tokens_ngrams(en.tokens,2)
# generate Trigrams
en.trigram <- tokens_ngrams(en.tokens,3)
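As a quick sanity check (illustrative, not part of the original output), the first few n-grams of the first document can be inspected:
# peek at the first few bigrams and trigrams of the first document
head(as.list(en.bigram)[[1]], 5)
head(as.list(en.trigram)[[1]], 5)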
# We need a function to calculate relative term frequency
# to get the average occurrence of a particular term, relative to the length of the document
term.frequency <- function(row) {
  row / sum(row)
}
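For example, a hypothetical document row with counts 2, 3 and 5 is normalized to the proportions 0.2, 0.3 and 0.5:
term.frequency(c(apple = 2, banana = 3, cherry = 5))
##  apple banana cherry
##    0.2    0.3    0.5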
# compute the document-feature matrix
en.unigram.dfm <- dfm(en.unigram, tolower = FALSE)
# convert to a regular matrix
en.unigram.matrix <- as.matrix(en.unigram.dfm)
# compute the normalized term frequency
en.unigram.matrix <- apply(en.unigram.matrix, 1, term.frequency)
# compute the sum across documents
en.unigram.matrix <- apply(en.unigram.matrix, 1, sum)
# convert the top 25 values into a dataframe
tab_uni <- as.data.frame(head(sort(en.unigram.matrix,decreasing = TRUE),25))
# add a new column of rownames
tab_uni$terms <- rownames(tab_uni)
# rename the labels of the columns
names(tab_uni) <- c("Frequency","Unigram")
# plot the graph
ggplot(tab_uni, aes(x = reorder(Unigram, Frequency), y = Frequency)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") + coord_flip() +
  xlab("Unigram") + ylab("Frequency") +
  labs(title = "Unigram Frequency Chart",
       subtitle = "Top 25 Normalized Terms",
       caption = "source: tab_uni")
# compute the document-feature matrix
en.bigram.dfm <- dfm(en.bigram, tolower = FALSE)
# convert to a regular matrix
en.bigram.matrix <- as.matrix(en.bigram.dfm)
# compute the normalized term frequency
en.bigram.matrix <- apply(en.bigram.matrix, 1, term.frequency)
# compute the sum across documents
en.bigram.matrix <- apply(en.bigram.matrix, 1, sum)
# convert the top 25 values into a dataframe
tab_bi <- as.data.frame(head(sort(en.bigram.matrix,decreasing = TRUE),25))
# add a new column of rownames
tab_bi$terms <- rownames(tab_bi)
# rename the labels of the columns
names(tab_bi) <- c("Frequency","Bigram")
# plot the graph
ggplot(tab_bi, aes(x = reorder(Bigram, Frequency), y = Frequency)) +
  geom_bar(stat = "identity", width = .5, fill = "springgreen3") + coord_flip() +
  xlab("Bigram") + ylab("Frequency") +
  labs(title = "Bigram Frequency Chart",
       subtitle = "Top 25 Normalized Terms",
       caption = "source: tab_bi")
# compute the document-feature matrix
en.trigram.dfm <- dfm(en.trigram, tolower = FALSE)
# convert to a regular matrix
en.trigram.matrix <- as.matrix(en.trigram.dfm)
# compute the normalized term frequency
en.trigram.matrix <- apply(en.trigram.matrix, 1, term.frequency)
# compute the sum across documents
en.trigram.matrix <- apply(en.trigram.matrix, 1, sum)
# convert the top 25 values into a dataframe
tab_tri <- as.data.frame(head(sort(en.trigram.matrix,decreasing = TRUE),25))
# add a new column of rownames
tab_tri$terms <- rownames(tab_tri)
# rename the labels of the columns
names(tab_tri) <- c("Frequency","Trigram")
# plot the graph
ggplot(tab_tri, aes(x = reorder(Trigram, Frequency), y = Frequency)) +
  geom_bar(stat = "identity", width = .5, fill = "steelblue3") + coord_flip() +
  xlab("Trigram") + ylab("Frequency") +
  labs(title = "Trigram Frequency Chart",
       subtitle = "Top 25 Normalized Terms",
       caption = "source: tab_tri")
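The dfm-to-top-25 pipeline above is repeated verbatim for unigrams, bigrams and trigrams, so it could be wrapped in a small helper; the following is only a sketch with illustrative names, not part of the original analysis:
# wrap the repeated dfm -> normalize -> sum -> top-n steps in one helper
top.terms <- function(ngram.tokens, n = 25) {
  m <- as.matrix(dfm(ngram.tokens, tolower = FALSE))  # document-feature matrix
  m <- apply(m, 1, term.frequency)                    # normalize counts within each document
  m <- apply(m, 1, sum)                               # sum normalized frequencies across documents
  tab <- as.data.frame(head(sort(m, decreasing = TRUE), n))
  tab$terms <- rownames(tab)
  names(tab) <- c("Frequency", "Term")
  tab
}
# usage (illustrative): tab_uni <- top.terms(en.unigram); tab_bi <- top.terms(en.bigram)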
Our next step is to build a prediction model that draws on the information in these n-grams, using an algorithm such as a random forest. Once we have a model with good prediction accuracy, it will be used to predict the next best word.
A Shiny application will then be built around this prediction model to test it, with just-in-time tweaks such as language selection and model selection.