The objective of the project is to build a tool that predicts the next word as the user types. The tool uses a corpus of sentences from three different sources: news articles, blogs and the Twitter microblogging site.
This report presents the exploratory analysis of the data in preparation for the tool build.
The steps followed in this analysis are: loading the data, cleaning it, tokenizing it into n-grams, and charting the most frequent terms.
The data has been processed separately for the blogs, news and Twitter datasets. Within each dataset, only a 5% random sample has been analysed due to the large volume of the data.
Load the necessary libraries
library(datasets); library(ggplot2); library(stats); library(graphics)
library(tm); library(stringi); library(Matrix); library(slam); library(RTextTools)
library(stringr); library("RWeka")
Read the blogs text data into R. The data has been downloaded from the link provided in the project page and manually unzipped to retrieve the text files.
# Fix the RNG seed so the 5% sample drawn below is reproducible.
set.seed(7777)
# Open the file read-only in binary mode ("rb"): on some platforms a text-mode
# read can stop early at an embedded end-of-file control character, which
# silently truncates the data.
bCon <- file("en_US/en_US.blogs.txt", "rb")
# skipNul = TRUE skips embedded nul bytes instead of ending the line at them.
# The finally clause closes the connection even when the read fails.
blogTextFull <- tryCatch(
  readLines(bCon, encoding = "UTF-8", skipNul = TRUE),
  finally = close(bCon)
)
# Keep a 5% random sample of the lines for the analysis.
blogText <- sample(blogTextFull, size = round(length(blogTextFull) * .05))
# Load the swear words list in preparation for data cleansing.
# The word list has been downloaded from the URL http://www.bannedwordlist.com/
sCon <- file("en_US/swearWords.txt", "r")
swearText <- tryCatch(
  readLines(sCon, encoding = "UTF-8"),
  finally = close(sCon)
)
This stage of the process uses the text mining R package “tm”. The data is converted to the Corpus class and a series of transformations such as conversion to lower case, collapsing whitespace, removing numbers and removing swear words are applied to create a Corpus ready for analysis. Note that no explicit punctuation-removal transformation is applied in the code below.
# Prepare the blogs dataset.
# Replace every character outside the [:graph:] class (e.g. whitespace and
# control characters) with a plain space.
blogText <- str_replace_all(blogText, "[^[:graph:]]", " ")
# Convert to UTF-8; sub = "byte" substitutes bytes that cannot be converted
# with their hex code rather than turning the whole string into NA.
blogText <- iconv(blogText, to = "UTF-8", sub = "byte")
# Build a VCorpus rather than calling Corpus(): in recent tm versions Corpus()
# may return a SimpleCorpus, for which TermDocumentMatrix() does not apply a
# custom `tokenize` function, so the n-gram tokenizers would have no effect.
bCorpus <- VCorpus(
  VectorSource(blogText),
  readerControl = list(reader = readPlain, language = "en_US", load = TRUE)
)
bCorpus <- tm_map(bCorpus, content_transformer(tolower))
bCorpus <- tm_map(bCorpus, removeNumbers)
bCorpus <- tm_map(bCorpus, removeWords, swearText)
# Collapse whitespace last so gaps left by removed numbers/words are cleaned up.
bCorpus <- tm_map(bCorpus, stripWhitespace)
Tokenize the text and analyze the frequency etc.
# N-gram tokenizers built on RWeka's NGramTokenizer. Each one passes the same
# value for min and max, so it emits n-grams of a single fixed size only.

# Single words.
UnigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 1, max = 1)
  NGramTokenizer(x, control = sizeControl)
}

# Two-word phrases.
BigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, control = sizeControl)
}

# Three-word phrases.
TrigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, control = sizeControl)
}

# Four-word phrases.
QuadrigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 4, max = 4)
  NGramTokenizer(x, control = sizeControl)
}
# For each n-gram size: build the term-document matrix, total every term's
# count across all documents, and keep the 20 most frequent terms.
# rollup() over margin 2 sums across the document columns, leaving one total
# per term. head() returns at most 20 entries, whereas `[1:20]` would pad the
# result with NA when fewer than 20 distinct terms exist.

# Unigrams
btdm1 <- TermDocumentMatrix(bCorpus, control = list(tokenize = UnigramTokenizer))
btdm1Sum <- as.matrix(rollup(btdm1, 2, FUN = sum))
btdm1MostFreq20 <- head(sort(rowSums(btdm1Sum), decreasing = TRUE), 20)

# Bigrams
btdm2 <- TermDocumentMatrix(bCorpus, control = list(tokenize = BigramTokenizer))
btdm2Sum <- as.matrix(rollup(btdm2, 2, FUN = sum))
btdm2MostFreq20 <- head(sort(rowSums(btdm2Sum), decreasing = TRUE), 20)

# Trigrams
btdm3 <- TermDocumentMatrix(bCorpus, control = list(tokenize = TrigramTokenizer))
btdm3Sum <- as.matrix(rollup(btdm3, 2, FUN = sum))
btdm3MostFreq20 <- head(sort(rowSums(btdm3Sum), decreasing = TRUE), 20)

# Four-grams
btdm4 <- TermDocumentMatrix(bCorpus, control = list(tokenize = QuadrigramTokenizer))
btdm4Sum <- as.matrix(rollup(btdm4, 2, FUN = sum))
btdm4MostFreq20 <- head(sort(rowSums(btdm4Sum), decreasing = TRUE), 20)
Sample dataset for this analysis contains 44964 documents and 77313 unique words.
The charts below show the frequencies of the 20 most frequent uni-grams, bi-grams, tri-grams and four-grams.
# Draw one bar chart per n-gram size (1 to 4) for the blogs sample, in order.
# Axis labels are drawn perpendicular to the axis (las = 2); the label size is
# 1 for the first two charts and 0.95 for the last two.
invisible(Map(
  function(topTerms, labelSize) {
    barplot(topTerms, names.arg = names(topTerms),
            main = "Term Frequency Count - Blogs", ylab = "Frequency",
            col = rainbow(20), las = 2, cex.names = labelSize)
  },
  list(btdm1MostFreq20, btdm2MostFreq20, btdm3MostFreq20, btdm4MostFreq20),
  c(1, 1, 0.95, 0.95)
))
# Clean up in prep for next dataset. Only the objects created by this section
# are removed; rm(list = ls()) would also wipe anything else the user has in
# the workspace. intersect() skips names that do not currently exist.
rm(list = intersect(ls(), c(
  "bCon", "sCon", "blogTextFull", "blogText", "swearText", "bCorpus",
  "UnigramTokenizer", "BigramTokenizer", "TrigramTokenizer",
  "QuadrigramTokenizer",
  "btdm1", "btdm2", "btdm3", "btdm4",
  "btdm1Sum", "btdm2Sum", "btdm3Sum", "btdm4Sum",
  "btdm1MostFreq20", "btdm2MostFreq20", "btdm3MostFreq20", "btdm4MostFreq20"
)))
Load the necessary libraries
library(datasets); library(ggplot2); library(stats); library(graphics)
library(tm); library(stringi); library(Matrix); library(slam); library(RTextTools)
library(stringr); library("RWeka")
Read the text data into R. The data has been downloaded from the link provided in the project page and manually unzipped to retrieve the text files.
# Fix the RNG seed so the 5% sample drawn below is reproducible.
set.seed(7777)
# Open the file read-only in binary mode ("rb"): on some platforms a text-mode
# read can stop early at an embedded end-of-file control character, which
# silently truncates the data.
# NOTE(review): the sample size reported for this dataset is far smaller than
# for the other two, which is consistent with a truncated text-mode read.
# Confirm the line count after re-running.
bCon <- file("en_US/en_US.news.txt", "rb")
# skipNul = TRUE skips embedded nul bytes instead of ending the line at them.
# The finally clause closes the connection even when the read fails.
blogTextFull <- tryCatch(
  readLines(bCon, encoding = "UTF-8", skipNul = TRUE),
  finally = close(bCon)
)
# Keep a 5% random sample of the lines for the analysis.
blogText <- sample(blogTextFull, size = round(length(blogTextFull) * .05))
# Load the swear words list in preparation for data cleansing.
# The word list has been downloaded from the URL http://www.bannedwordlist.com/
sCon <- file("en_US/swearWords.txt", "r")
swearText <- tryCatch(
  readLines(sCon, encoding = "UTF-8"),
  finally = close(sCon)
)
This stage of the process uses the text mining R package “tm”. The data is converted to the Corpus class and a series of transformations such as conversion to lower case, collapsing whitespace, removing numbers and removing swear words are applied to create a Corpus ready for analysis. Note that no explicit punctuation-removal transformation is applied in the code below.
# Prepare the dataset.
# Replace every character outside the [:graph:] class (e.g. whitespace and
# control characters) with a plain space.
blogText <- str_replace_all(blogText, "[^[:graph:]]", " ")
# Convert to UTF-8; sub = "byte" substitutes bytes that cannot be converted
# with their hex code rather than turning the whole string into NA.
blogText <- iconv(blogText, to = "UTF-8", sub = "byte")
# Build a VCorpus rather than calling Corpus(): in recent tm versions Corpus()
# may return a SimpleCorpus, for which TermDocumentMatrix() does not apply a
# custom `tokenize` function, so the n-gram tokenizers would have no effect.
bCorpus <- VCorpus(
  VectorSource(blogText),
  readerControl = list(reader = readPlain, language = "en_US", load = TRUE)
)
bCorpus <- tm_map(bCorpus, content_transformer(tolower))
bCorpus <- tm_map(bCorpus, removeNumbers)
bCorpus <- tm_map(bCorpus, removeWords, swearText)
# Collapse whitespace last so gaps left by removed numbers/words are cleaned up.
bCorpus <- tm_map(bCorpus, stripWhitespace)
Tokenize the text and analyze the frequency etc.
# N-gram tokenizers built on RWeka's NGramTokenizer. Each one passes the same
# value for min and max, so it emits n-grams of a single fixed size only.

# Single words.
UnigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 1, max = 1)
  NGramTokenizer(x, control = sizeControl)
}

# Two-word phrases.
BigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, control = sizeControl)
}

# Three-word phrases.
TrigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, control = sizeControl)
}

# Four-word phrases.
QuadrigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 4, max = 4)
  NGramTokenizer(x, control = sizeControl)
}
# For each n-gram size: build the term-document matrix, total every term's
# count across all documents, and keep the 20 most frequent terms.
# rollup() over margin 2 sums across the document columns, leaving one total
# per term. head() returns at most 20 entries, whereas `[1:20]` would pad the
# result with NA when fewer than 20 distinct terms exist.

# Unigrams
btdm1 <- TermDocumentMatrix(bCorpus, control = list(tokenize = UnigramTokenizer))
btdm1Sum <- as.matrix(rollup(btdm1, 2, FUN = sum))
btdm1MostFreq20 <- head(sort(rowSums(btdm1Sum), decreasing = TRUE), 20)

# Bigrams
btdm2 <- TermDocumentMatrix(bCorpus, control = list(tokenize = BigramTokenizer))
btdm2Sum <- as.matrix(rollup(btdm2, 2, FUN = sum))
btdm2MostFreq20 <- head(sort(rowSums(btdm2Sum), decreasing = TRUE), 20)

# Trigrams
btdm3 <- TermDocumentMatrix(bCorpus, control = list(tokenize = TrigramTokenizer))
btdm3Sum <- as.matrix(rollup(btdm3, 2, FUN = sum))
btdm3MostFreq20 <- head(sort(rowSums(btdm3Sum), decreasing = TRUE), 20)

# Four-grams
btdm4 <- TermDocumentMatrix(bCorpus, control = list(tokenize = QuadrigramTokenizer))
btdm4Sum <- as.matrix(rollup(btdm4, 2, FUN = sum))
btdm4MostFreq20 <- head(sort(rowSums(btdm4Sum), decreasing = TRUE), 20)
Sample dataset for this analysis contains 3863 documents and 17821 unique words.
The charts below show the frequencies of the 20 most frequent uni-grams, bi-grams, tri-grams and four-grams.
# Draw one bar chart per n-gram size (1 to 4) for the news sample, in order.
# Axis labels are drawn perpendicular to the axis (las = 2); the label size is
# 1 for the first two charts and 0.95 for the last two.
invisible(Map(
  function(topTerms, labelSize) {
    barplot(topTerms, names.arg = names(topTerms),
            main = "Term Frequency Count - News", ylab = "Frequency",
            col = rainbow(20), las = 2, cex.names = labelSize)
  },
  list(btdm1MostFreq20, btdm2MostFreq20, btdm3MostFreq20, btdm4MostFreq20),
  c(1, 1, 0.95, 0.95)
))
# Clean up in prep for next dataset. Only the objects created by this section
# are removed; rm(list = ls()) would also wipe anything else the user has in
# the workspace. intersect() skips names that do not currently exist.
rm(list = intersect(ls(), c(
  "bCon", "sCon", "blogTextFull", "blogText", "swearText", "bCorpus",
  "UnigramTokenizer", "BigramTokenizer", "TrigramTokenizer",
  "QuadrigramTokenizer",
  "btdm1", "btdm2", "btdm3", "btdm4",
  "btdm1Sum", "btdm2Sum", "btdm3Sum", "btdm4Sum",
  "btdm1MostFreq20", "btdm2MostFreq20", "btdm3MostFreq20", "btdm4MostFreq20"
)))
Note that the code is the same as in the blogs section; only the file name in the load step changes.
Load the necessary libraries
library(datasets); library(ggplot2); library(stats); library(graphics)
library(tm); library(stringi); library(Matrix); library(slam); library(RTextTools)
library(stringr); library("RWeka")
Read the text data into R. The data has been downloaded from the link provided in the project page and manually unzipped to retrieve the text files.
# Fix the RNG seed so the 5% sample drawn below is reproducible.
set.seed(7777)
# Open the file read-only in binary mode ("rb"): on some platforms a text-mode
# read can stop early at an embedded end-of-file control character, which
# silently truncates the data.
bCon <- file("en_US/en_US.twitter.txt", "rb")
# skipNul = TRUE skips embedded nul bytes instead of ending the line at them
# (with a warning). The finally clause closes the connection even when the
# read fails.
blogTextFull <- tryCatch(
  readLines(bCon, encoding = "UTF-8", skipNul = TRUE),
  finally = close(bCon)
)
# Keep a 5% random sample of the lines for the analysis.
blogText <- sample(blogTextFull, size = round(length(blogTextFull) * .05))
# Load the swear words list in preparation for data cleansing.
# The word list has been downloaded from the URL http://www.bannedwordlist.com/
sCon <- file("en_US/swearWords.txt", "r")
swearText <- tryCatch(
  readLines(sCon, encoding = "UTF-8"),
  finally = close(sCon)
)
This stage of the process uses the text mining R package “tm”. The data is converted to the Corpus class and a series of transformations such as conversion to lower case, collapsing whitespace, removing numbers and removing swear words are applied to create a Corpus ready for analysis. Note that no explicit punctuation-removal transformation is applied in the code below.
# Prepare the dataset.
# Replace every character outside the [:graph:] class (e.g. whitespace and
# control characters) with a plain space.
blogText <- str_replace_all(blogText, "[^[:graph:]]", " ")
# Convert to UTF-8; sub = "byte" substitutes bytes that cannot be converted
# with their hex code rather than turning the whole string into NA.
blogText <- iconv(blogText, to = "UTF-8", sub = "byte")
# Build a VCorpus rather than calling Corpus(): in recent tm versions Corpus()
# may return a SimpleCorpus, for which TermDocumentMatrix() does not apply a
# custom `tokenize` function, so the n-gram tokenizers would have no effect.
bCorpus <- VCorpus(
  VectorSource(blogText),
  readerControl = list(reader = readPlain, language = "en_US", load = TRUE)
)
bCorpus <- tm_map(bCorpus, content_transformer(tolower))
bCorpus <- tm_map(bCorpus, removeNumbers)
bCorpus <- tm_map(bCorpus, removeWords, swearText)
# Collapse whitespace last so gaps left by removed numbers/words are cleaned up.
bCorpus <- tm_map(bCorpus, stripWhitespace)
Tokenize the text and analyze the frequency etc.
# N-gram tokenizers built on RWeka's NGramTokenizer. Each one passes the same
# value for min and max, so it emits n-grams of a single fixed size only.

# Single words.
UnigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 1, max = 1)
  NGramTokenizer(x, control = sizeControl)
}

# Two-word phrases.
BigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, control = sizeControl)
}

# Three-word phrases.
TrigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, control = sizeControl)
}

# Four-word phrases.
QuadrigramTokenizer <- function(x) {
  sizeControl <- Weka_control(min = 4, max = 4)
  NGramTokenizer(x, control = sizeControl)
}
# For each n-gram size: build the term-document matrix, total every term's
# count across all documents, and keep the 20 most frequent terms.
# rollup() over margin 2 sums across the document columns, leaving one total
# per term. head() returns at most 20 entries, whereas `[1:20]` would pad the
# result with NA when fewer than 20 distinct terms exist.

# Unigrams
btdm1 <- TermDocumentMatrix(bCorpus, control = list(tokenize = UnigramTokenizer))
btdm1Sum <- as.matrix(rollup(btdm1, 2, FUN = sum))
btdm1MostFreq20 <- head(sort(rowSums(btdm1Sum), decreasing = TRUE), 20)

# Bigrams
btdm2 <- TermDocumentMatrix(bCorpus, control = list(tokenize = BigramTokenizer))
btdm2Sum <- as.matrix(rollup(btdm2, 2, FUN = sum))
btdm2MostFreq20 <- head(sort(rowSums(btdm2Sum), decreasing = TRUE), 20)

# Trigrams
btdm3 <- TermDocumentMatrix(bCorpus, control = list(tokenize = TrigramTokenizer))
btdm3Sum <- as.matrix(rollup(btdm3, 2, FUN = sum))
btdm3MostFreq20 <- head(sort(rowSums(btdm3Sum), decreasing = TRUE), 20)

# Four-grams
btdm4 <- TermDocumentMatrix(bCorpus, control = list(tokenize = QuadrigramTokenizer))
btdm4Sum <- as.matrix(rollup(btdm4, 2, FUN = sum))
btdm4MostFreq20 <- head(sort(rowSums(btdm4Sum), decreasing = TRUE), 20)
Sample dataset for this analysis contains 118007 documents and 69858 unique words.
The charts below show the frequencies of the 20 most frequent uni-grams, bi-grams, tri-grams and four-grams.
# Draw one bar chart per n-gram size (1 to 4) for the twitter sample, in order.
# Axis labels are drawn perpendicular to the axis (las = 2); the label size is
# 1 for the first two charts and 0.95 for the last two.
invisible(Map(
  function(topTerms, labelSize) {
    barplot(topTerms, names.arg = names(topTerms),
            main = "Term Frequency Count - Twitter", ylab = "Frequency",
            col = rainbow(20), las = 2, cex.names = labelSize)
  },
  list(btdm1MostFreq20, btdm2MostFreq20, btdm3MostFreq20, btdm4MostFreq20),
  c(1, 1, 0.95, 0.95)
))
Ingo Feinerer, Kurt Hornik, and David Meyer (2008). Text Mining Infrastructure in R. Journal of Statistical Software 25(5): 1-54. URL: http://www.jstatsoft.org/v25/i05/.