To initialize the text mining analysis, we first load the required packages.

library(tm)
library(slam)
library(quanteda)
library(qdap)
library(SnowballC)
library(RWeka)
library(reshape2)

The next step is to load the data sets. We work with three data sets: 1. Twitter 2. Blogs 3. News

To load these data sets we use the following commands:

con <- file("Z:/Ferguson 13 march/other assignment/DS/Coursera/Courses/Capstone/data/en_US/en_US.twitter.txt", "r")
twitterData <- readLines(con, -1L)
## Warning in readLines(con, -1L): line 167155 appears to contain an embedded
## nul
## Warning in readLines(con, -1L): line 268547 appears to contain an embedded
## nul
## Warning in readLines(con, -1L): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(con, -1L): line 1759032 appears to contain an embedded
## nul
con2 <- file("Z:/Ferguson 13 march/other assignment/DS/Coursera/Courses/Capstone/data/en_US/en_US.news.txt", "r")
newsData <- readLines(con2, -1L)
## Warning in readLines(con2, -1L): incomplete final line found on 'Z:/
## Ferguson 13 march/other assignment/DS/Coursera/Courses/Capstone/data/en_US/
## en_US.news.txt'
con3 <- file("Z:/Ferguson 13 march/other assignment/DS/Coursera/Courses/Capstone/data/en_US/en_US.blogs.txt", "r")
blogsData <- readLines(con3, -1L) 

close(con)
close(con2)
close(con3)
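
Before sampling, it is useful to get a rough feel for the size of each data set (a minimal sketch using base R; the exact numbers depend on the files):

# Number of lines read from each source
sapply(list(twitter = twitterData, news = newsData, blogs = blogsData), length)
# Length in characters of the longest line in each source
sapply(list(twitter = twitterData, news = newsData, blogs = blogsData),
       function(x) max(nchar(x)))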

The next step is to clean the data.

The following commands take a random sample of only 5% of the lines from the Twitter, blogs and news data sets.

sampleSize <- .05
set.seed(1234)
newsSubSet <- sample(newsData, length(newsData)*sampleSize)
blogsSubSet <- sample(blogsData, length(blogsData)*sampleSize)
twitterSubSet <- sample(twitterData, length(twitterData)*sampleSize)
subSet <- sample(c(newsSubSet, blogsSubSet, twitterSubSet))
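
To avoid re-reading the large raw files in later sessions, the combined sample can be checked and written to disk (a sketch; the file name sampleSubSet.txt is only an example):

# Number of lines in the combined 5% sample
length(subSet)
# Save the sample so later runs can start from this smaller file
writeLines(subSet, "sampleSubSet.txt")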

Processing text data is computationally intensive. For this we use the doParallel package, which uses multiple computing cores to speed up the process.

library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoParallel(4)
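
The number of cores is hard-coded to 4 above; as an alternative sketch, it can be derived from the machine itself (leaving one core free is a convention, not a requirement):

# Register all but one of the available cores
nCores <- max(1, parallel::detectCores() - 1)
registerDoParallel(nCores)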

Next, to perform the analyses, we need to shape our data into a form that the text mining libraries can work with: the sampled lines are split into sentences and collected into a corpus.

subSet <- sent_detect(subSet, language = "en", model = NULL)
corpus <- VCorpus(VectorSource(subSet))
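
A quick way to confirm that the corpus was built as expected is to print its summary and look at one document (a small sketch using tm's accessors):

# Number of documents in the corpus
print(corpus)
# Text of the first document
content(corpus[[1]])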

The following commands clean and transform the corpus: special characters, numbers and punctuation are removed, whitespace is collapsed, the text is lower-cased and words are stemmed.

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "/")                  # replace slashes with spaces
corpus <- tm_map(corpus, toSpace, "@")                  # replace @ signs with spaces
corpus <- tm_map(corpus, toSpace, "\\|")                # replace pipes with spaces
corpus <- tm_map(corpus, removeNumbers)                 # drop digits
corpus <- tm_map(corpus, stripWhitespace)               # collapse repeated whitespace
corpus <- tm_map(corpus, content_transformer(tolower))  # lower-case the text
corpus <- tm_map(corpus, removePunctuation)             # drop punctuation
corpus <- tm_map(corpus, stemDocument)                  # stem words (SnowballC)
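
The pipeline above keeps English stop words, which matters for next-word prediction; if they should be excluded from the frequency exploration instead, tm's removeWords can be applied as a separate step (a sketch, not part of the pipeline above):

# Optional: a copy of the corpus with common English stop words removed
corpusNoStop <- tm_map(corpus, removeWords, stopwords("en"))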

*******************************Single gram*******************************

In this section we find the single words (unigrams) that are used most frequently.

MakeNGram = function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
td.mat <-  TermDocumentMatrix(corpus, control = list(tokenize = MakeNGram))
dtm <- removeSparseTerms(td.mat, 0.999)
rowTotals <- apply(dtm, 1, sum) # total count of each term across all documents
dtm <- dtm[rowTotals > 0, ]     # drop terms that no longer occur after sparse-term removal
mat <- as.matrix(dtm)
v <- sort(rowSums(mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
highFreq <- findFreqTerms(dtm, lowfreq = 500)
head(highFreq)
## [1] "â<U+0080><U+0093>"    "â<U+0080><U+0094>"    "â<U+0080><U+009C>"    "abl"    "about"  "accept"
sum(d[1:50,2])/sum(d[,2])
## [1] 0.3869332
sum(d[1:100,2])/sum(d[,2])
## [1] 0.4981427

The two numbers above show that the top 50 and top 100 words cover approximately 39 percent and 50 percent of all word occurrences in the sample.
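
The same coverage figures can be computed directly with a cumulative sum over the frequency table d built above, which also tells us how many words are needed for any coverage level (a short sketch):

# Cumulative share of all word occurrences covered by the top-ranked words
coverage <- cumsum(d$freq) / sum(d$freq)
# Smallest number of top words needed to cover 50% and 90% of occurrences
which(coverage >= 0.5)[1]
which(coverage >= 0.9)[1]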

The graph below shows the 10 most frequent single words in the data set.

barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies")

*******************************2-gram*************************************

In this section we find the bigrams (2-grams) that are used most frequently.

MakeNGram2 = function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
td.mat2 <-  TermDocumentMatrix(corpus, control = list(tokenize = MakeNGram2))
dtm <- removeSparseTerms(td.mat2, 0.999)
rowTotals <- apply(dtm, 1, sum) # total count of each term across all documents
dtm <- dtm[rowTotals > 0, ]     # drop terms that no longer occur after sparse-term removal
mat <- as.matrix(dtm)
v <- sort(rowSums(mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
highFreq <- findFreqTerms(dtm, lowfreq = 500)
head(highFreq)
## [1] "a big"   "a bit"   "a coupl" "a few"   "a good"  "a great"
barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col ="yellow", main ="Most frequent words-2-gram",
        ylab = "Word frequencies")

*******************************3-gram*******************************

In this section we find the trigrams (3-grams) that are used most frequently.

MakeNGram3 = function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
td.mat3 <-  TermDocumentMatrix(corpus, control = list(tokenize = MakeNGram3))
dtm <- removeSparseTerms(td.mat3, 0.999)
rowTotals <- apply(dtm, 1, sum) # total count of each term across all documents
dtm <- dtm[rowTotals > 0, ]     # drop terms that no longer occur after sparse-term removal
mat <- as.matrix(dtm)
v <- sort(rowSums(mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
highFreq <- findFreqTerms(dtm, lowfreq = 500)
head(highFreq)
## [1] "a lot of"  "be abl to" "go to be"  "i have a"  "i have to" "i need to"
sum(d[1:50,2])/sum(d[,2])
## [1] 0.5018794
sum(d[1:100,2])/sum(d[,2])
## [1] 0.7744218

The two numbers above show that the top 50 and top 100 trigrams cover approximately 50 percent and 77 percent of all trigram occurrences retained in the matrix.

The graph below shows the 10 most frequent 3-grams in the data set.

barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col ="green", main ="Most frequent words-3-gram",
        ylab = "Word frequencies")

Future actions

Now that we have sampled our text data and examined the frequency trends for the different word combinations, the next step is to apply predictive algorithms specific to text mining in order to build an accurate model that can predict the next word.
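
One possible approach (a sketch only, not the final model) is a simple frequency-based backoff: look the last two words of a phrase up in the trigram table, fall back to the bigram table, and return the most frequent continuation. The function below assumes hypothetical frequency tables tri and bi in the same shape as d above (a column word with the n-gram text and a column freq with its count); because the corpus was stemmed, such tables would contain stems.

predictNextWord <- function(phrase, tri, bi) {
  # tri and bi are hypothetical data frames with columns 'word' and 'freq'
  tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(tokens)
  if (n == 0) return(NA_character_)
  lookup <- function(table, prefix) {
    words <- as.character(table$word)
    hits <- which(grepl(paste0("^", prefix, " "), words))
    if (length(hits) == 0) return(NA_character_)
    best <- words[hits][which.max(table$freq[hits])]
    tail(strsplit(best, " ")[[1]], 1)   # keep only the predicted last word
  }
  # First try trigrams that start with the last two words of the phrase
  if (n >= 2) {
    guess <- lookup(tri, paste(tokens[n - 1], tokens[n]))
    if (!is.na(guess)) return(guess)
  }
  # Back off to bigrams that start with the last word only
  lookup(bi, tokens[n])
}

# Example call (hypothetical tables): predictNextWord("thanks for the", tri, bi)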

After developing the predictive model, we will deploy it as a Shiny web-based application that, given the words typed so far, suggests the most likely next word(s).
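
A minimal outline of such a Shiny application might look as follows (a sketch only; predictNextWord, tri and bi are the hypothetical objects from the sketch above):

library(shiny)

ui <- fluidPage(
  titlePanel("Next word prediction"),
  textInput("phrase", "Type a phrase:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(input$phrase) == 0) return("")
    predictNextWord(input$phrase, tri, bi)
  })
}

shinyApp(ui = ui, server = server)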