Load needed libraries

library(tm)
library(plyr)
library(dplyr)
library(wordcloud)
library(ggplot2)
library(plotly)
library(RWeka)
library(SnowballC)

Read and Investigate the Data

Data source: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

In this first phase of the project, we read the files and extract some basic information: file size, number of lines, maximum line length, total word count and average words per line.

us_blogs_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
us_news_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt"
us_twitter_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

con_blog <- file(us_blogs_loc) ; con_news <- file(us_news_loc) ; con_twit <- file(us_twitter_loc)

# File size in MB
Size_MegaBytes <- c(file.info(us_blogs_loc)$size/1048576,file.info(us_news_loc)$size/1048576,file.info(us_twitter_loc)$size/1048576)
# Read the files

blog <- readLines(con_blog, encoding = "UTF-8")
news <- readLines(con_news, encoding = "UTF-8")
twit <- readLines(con_twit, encoding = "UTF-8")

# Number of lines
Number_of_Lines <- c(length(blog), length(news),length(twit))
# Max number of characters in a line
Max_Characters_in_Line <- c(max(nchar(blog)), max(nchar(news)), max(nchar(twit)))
# Total word count
Total_Number_of_Words <- c(ngram::wordcount(blog),ngram::wordcount(news),ngram::wordcount(twit))
Average_Words_per_Line <- Total_Number_of_Words/Number_of_Lines

data_table_combined <- data.frame(Source=c("Blogs","News","Twitter"),Size_MegaBytes,Number_of_Lines,Max_Characters_in_Line,Total_Number_of_Words,Average_Words_per_Line)
knitr::kable(data_table_combined)
Source    Size_MegaBytes   Number_of_Lines   Max_Characters_in_Line   Total_Number_of_Words   Average_Words_per_Line
Blogs           200.4242            899288                    40833                37334131                 41.51521
News            196.2775             77259                     5760                 2643969                 34.22215
Twitter         159.3641           2360148                      140                30373543                 12.86934
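
The line count for the news file is much lower than for the other two sources. A plausible explanation (an assumption, not verified here) is an embedded control character in en_US.news.txt, which makes readLines stop early when the connection is opened in text mode on Windows. A minimal sketch of a workaround, reusing the path defined above:

# Sketch: re-open the news file in binary mode and skip embedded nuls,
# so that readLines does not stop at an embedded EOF-like character.
con_news_bin <- file(us_news_loc, open = "rb")
news_full <- readLines(con_news_bin, encoding = "UTF-8", skipNul = TRUE)
close(con_news_bin)
length(news_full)  # expected to be noticeably larger than 77259 if truncation occurred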

Data Sampling and Merging

set.seed(12345)
sample_size <- 20000


sblog <- sample(blog,sample_size)
snews <- sample(news,sample_size)
stwit <- sample(twit,sample_size)

iblog <- iconv(sblog,"UTF-8","ASCII","byte")
inews <- iconv(snews,"UTF-8","ASCII","byte")
itwit <- iconv(stwit,"UTF-8","ASCII","byte")

US_all_sample_data <- c(iblog,inews,itwit)

# This step helps during development:
# each later session can start by reading the saved sample file,
# avoiding the need to re-read the full files every time.

#file_loc <- "D:/capstone/US_all_sample_data.txt"
#writeLines(US_all_sample_data,file_loc)

rm(blog,news,twit)
close(con_twit)
close(con_blog)
close(con_news)

Corpus creation

US_corpus_20k <- VCorpus(VectorSource(US_all_sample_data))

Corpus Transformation

According to the “tm” package vignette, a number of standard transformations are in order:

US_corpus_20k <- tm_map(US_corpus_20k,removeNumbers)
US_corpus_20k <- tm_map(US_corpus_20k,content_transformer(tolower))
US_corpus_20k <- tm_map(US_corpus_20k,removeWords,stopwords("english"))
US_corpus_20k <- tm_map(US_corpus_20k,removePunctuation)
US_corpus_20k <- tm_map(US_corpus_20k,stripWhitespace)
US_corpus_20k <- tm_map(US_corpus_20k,PlainTextDocument)

Exploratory Analysis

wordcloud(US_corpus_20k, scale=c(4,0.5), min.freq=10, max.words=150, random.order=TRUE,
          rot.per=0.5, colors=brewer.pal(8, "Set1"), use.r.layout=FALSE)

Tokenize Corpus

Next is the tokenization of the corpus. Two possible alternatives were identified: a TDM (Term-Document Matrix) or a data frame. The data frame approach is used below; a sketch of the TDM alternative follows the tokenization code.

df_US_corpus_20k <- data.frame(text = unlist(sapply(US_corpus_20k, '[', 'content')), stringsAsFactors = F)

df_uniGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 1, max = 1))))
df_biGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 2, max = 2))))
df_triGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 3, max = 3))))

# for dev purposes
#write.csv(df_uniGramToken,'unigram.csv')
#write.csv(df_biGramToken,'bigram.csv')
#write.csv(df_triGramToken,'trigram.csv')
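
For completeness, the TDM alternative mentioned above could look roughly like the sketch below (not run as part of this analysis): a TermDocumentMatrix is built directly from the corpus with an RWeka tokenizer supplying the n-grams, and frequencies are obtained by summing the rows.

# Sketch of the TDM alternative: a bigram term-document matrix built with RWeka
bigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_bigram <- TermDocumentMatrix(US_corpus_20k, control = list(tokenize = bigram_tokenizer))
# slam is installed as a dependency of tm; row_sums gives the per-term totals
bigram_freq <- sort(slam::row_sums(tdm_bigram), decreasing = TRUE)
head(bigram_freq, 25)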

Plot Top-25 Unigrams

#useful during dev
#df_uniGramToken <- read.csv('unigram.csv')
#order by decreasing frequency and keep the top 25, dropping the token "e"
#(likely an artefact of the earlier byte-level encoding conversion)
uniG <- head(df_uniGramToken[order(-df_uniGramToken$Freq),],26)
names(uniG) <- c("word","frequency")
data <- uniG %>% filter(word != 'e') 

ggplot2::ggplot(data, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  labs(x = NULL, y = NULL) +
  ggtitle("UniGram Frequency") +
  theme(legend.position = "none")
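
In the chart above, the words follow the default alphabetical factor ordering. If a frequency-ordered chart is preferred, one option (a sketch, not part of the original analysis; it drops the polar coordinates for a plain bar chart) is to reorder the factor levels before plotting:

# Sketch: reorder the factor levels so bars run from most to least frequent
data$word <- reorder(data$word, -data$frequency)
ggplot2::ggplot(data, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  labs(x = NULL, y = NULL) +
  ggtitle("UniGram Frequency (ordered)") +
  theme(legend.position = "none")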

Plot Top-25 Bigrams

#useful during dev
#df_biGramToken <- read.csv('bigram.csv')
#order by decreasing frequency
BiGram <- head(df_biGramToken[order(-df_biGramToken$Freq),],25)
names(BiGram) <- c("word","frequency")

ggplot2::ggplot(BiGram, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  labs(x = NULL, y = NULL) +
  ggtitle("BiGram Frequency") +
  theme(legend.position = "none")

Plot Top-25 Trigrams

#useful during dev
#df_triGramToken <- read.csv('trigram.csv') 
#order by decreasing frequency
TriGram <- head(df_triGramToken[order(-df_triGramToken$Freq),],25)
names(TriGram) <- c("word","frequency")

ggplot2::ggplot(TriGram, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  labs(x = NULL, y = NULL) +
  ggtitle("TriGram Frequency") +
  theme(legend.position = "none")

Conclusions

In this work, we have read three (3) different data sources (blogs, news and Twitter feeds). From each we took a sample, merged the samples into a single body of text (the corpus) and applied a set of transformations to it (removed numbers, stop-words and punctuation, and converted the text to lowercase).

For the merged dataset (the corpus), we plotted a “word cloud”, a color plot of the 150 most common words in which higher-frequency words are drawn larger. We then “tokenized” the corpus to extract the frequencies of the most common unigrams, bigrams and trigrams, i.e. the most common single words and combinations of 2 and 3 words respectively.

Some basic assumptions were in place for the exploratory data analysis. Those were:

  1. All 3 datasets contributed with the same “weight”, because they were merged before the words were counted. This may not be ideal, since Twitter feeds, for example, are likely to favour different frequent words than blogs. In the context of generic word prediction, however, this may not be important.
  2. Different spellings of the same word, and misspelled words, were not taken into account.
  3. The same applies to names; for example, the name “Alexander” appears in 5 variations among the generated unigrams.
  4. We sampled 20,000 lines from each of the 3 datasets, even though the datasets differ considerably in size, so each sample represents a different fraction of its source. This was done for simplicity.
  5. Profanities were not removed. This may change later in the process; for now they are kept, so we can see how often unwanted words might be predicted (a possible filtering step is sketched after this list).
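
If profanity filtering is added later, one possible approach (a sketch only; the word-list file name is hypothetical) is to treat the profanity list like the stop-word list and remove it during the corpus transformation step:

# Sketch: remove profanities the same way stop words are removed.
# "profanity_list.txt" is a hypothetical file with one unwanted word per line.
profanity_words <- readLines("profanity_list.txt", encoding = "UTF-8")
US_corpus_20k <- tm_map(US_corpus_20k, removeWords, profanity_words)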

Moving forward

It remains to create the TDM (Term-Document Matrix, https://en.wikipedia.org/wiki/Document-term_matrix), from which we can further eliminate sparse terms. Term sparsity can also be compared across samples of different sizes, to identify an ideal sample size: one that is not too small, yet beyond which larger samples add little.
With the right sample in place, we will frame the prediction problem: build the n-grams, remove the last word of each, and then attempt to predict it with an algorithm.
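
A rough sketch of these next steps, assuming the corpus and the trigram table from above are reused (the sparsity threshold and the helper function below are illustrative only, not a final design):

# Sketch: build a document-term matrix and drop very sparse terms
dtm <- DocumentTermMatrix(US_corpus_20k)
dtm_dense <- removeSparseTerms(dtm, 0.99)  # keep terms appearing in at least ~1% of documents
dim(dtm_dense)

# Sketch: split each trigram into a two-word prefix and the word to predict
names(df_triGramToken) <- c("ngram","freq")
df_triGramToken$ngram  <- as.character(df_triGramToken$ngram)
df_triGramToken$prefix <- sub(" \\S+$", "", df_triGramToken$ngram)  # first two words
df_triGramToken$target <- sub("^.* ", "", df_triGramToken$ngram)    # last word

# A naive predictor: for a given two-word prefix, return the most frequent last word
predict_next <- function(prefix, ngrams = df_triGramToken) {
  candidates <- ngrams[ngrams$prefix == prefix, ]
  if (nrow(candidates) == 0) return(NA_character_)
  candidates$target[which.max(candidates$freq)]
}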

N.Perdikis