Load needed libraries

library(tm)
library(plyr)
library(dplyr)
library(wordcloud)
library(ggplot2)
library(plotly)
library(RWeka)
library(SnowballC)

Read and Investigate the Data

Data source: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

In this first phase of the project, we read the files and extract some basic information: file size, number of lines, maximum line length, total word count and average words per line.

us_blogs_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
us_news_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt"
us_twitter_loc <- "D:/capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

con_blog <- file(us_blogs_loc) ; con_news <- file(us_news_loc) ; con_twit <- file(us_twitter_loc)

# File size in MB
Size_MegaBytes <- c(file.info(us_blogs_loc)$size/1048576,file.info(us_news_loc)$size/1048576,file.info(us_twitter_loc)$size/1048576)
# Read the files

blog <- readLines(con_blog, encoding = "UTF-8")
news <- readLines(con_news, encoding = "UTF-8")
twit <- readLines(con_twit, encoding = "UTF-8")

# Number of lines
Number_of_Lines <- c(length(blog), length(news),length(twit))
# Max number of characters in a line
Max_Characters_in_Line <- c(max(nchar(blog)), max(nchar(news)), max(nchar(twit)))
# Total word count
Total_Number_of_Words <- c(ngram::wordcount(blog),ngram::wordcount(news),ngram::wordcount(twit))
Average_Words_per_Line <- Total_Number_of_Words/Number_of_Lines

data_table_combined <- data.frame(Source=c("Blogs","News","Twitter"),Size_MegaBytes,Number_of_Lines,Max_Characters_in_Line,Total_Number_of_Words,Average_Words_per_Line)
knitr::kable(data_table_combined)
Source    Size_MegaBytes   Number_of_Lines   Max_Characters_in_Line   Total_Number_of_Words   Average_Words_per_Line
Blogs           200.4242            899288                    40833                37334131                 41.51521
News            196.2775             77259                     5760                 2643969                 34.22215
Twitter         159.3641           2360148                      140                30373543                 12.86934
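
The line count for the news file is much lower than for the other two sources. A plausible explanation (an assumption, not verified here) is an embedded control character in en_US.news.txt, which makes readLines stop early when the connection is opened in text mode on Windows. A minimal sketch of a workaround, reusing the path defined above:

# Sketch: re-open the news file in binary mode and skip embedded nuls,
# so that readLines does not stop at an embedded EOF-like character.
con_news_bin <- file(us_news_loc, open = "rb")
news_full <- readLines(con_news_bin, encoding = "UTF-8", skipNul = TRUE)
close(con_news_bin)
length(news_full)  # expected to be noticeably larger than 77259 if truncation occurred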

Data Sampling and Merging

set.seed(12345)
sample_size <- 20000


sblog <- sample(blog,sample_size)
snews <- sample(news,sample_size)
stwit <- sample(twit,sample_size)

iblog <- iconv(sblog,"UTF-8","ASCII","byte")
inews <- iconv(snews,"UTF-8","ASCII","byte")
itwit <- iconv(stwit,"UTF-8","ASCII","byte")

US_all_sample_data <- c(iblog,inews,itwit)

# This step helps during development:
# each later session can start by reading the saved sample file,
# avoiding the need to re-read the full files every time.

#file_loc <- "D:/capstone/US_all_sample_data.txt"
#writeLines(US_all_sample_data,file_loc)

rm(blog,news,twit)
close(con_twit)
close(con_blog)
close(con_news)

Corpus creation

US_corpus_20k <- VCorpus(VectorSource(US_all_sample_data))

Corpus Transformation

According to the “tm” package vignette, a number of standard transformations are in order:

US_corpus_20k <- tm_map(US_corpus_20k,removeNumbers)
US_corpus_20k <- tm_map(US_corpus_20k,content_transformer(tolower))
US_corpus_20k <- tm_map(US_corpus_20k,removeWords,stopwords("english"))
US_corpus_20k <- tm_map(US_corpus_20k,removePunctuation)
US_corpus_20k <- tm_map(US_corpus_20k,stripWhitespace)
US_corpus_20k <- tm_map(US_corpus_20k,PlainTextDocument)

Exploratory Analysis

wordcloud(US_corpus_20k, scale=c(4,0.5), min.freq=10, max.words=150, random.order=TRUE,
          rot.per=0.5, colors=brewer.pal(8, "Set1"), use.r.layout=FALSE)

Tokenize Corpus

Next is the tokenization of the corpus. Two possible alternatives were identified: a TDM (Term-Document Matrix) or a data frame. The data frame approach is used below; a sketch of the TDM alternative follows the tokenization code.

df_US_corpus_20k <- data.frame(text = unlist(sapply(US_corpus_20k, '[', 'content')), stringsAsFactors = F)

df_uniGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 1, max = 1))))
df_biGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 2, max = 2))))
df_triGramToken <- data.frame(table(NGramTokenizer(df_US_corpus_20k$text, Weka_control(min = 3, max = 3))))

# for dev purposes
#write.csv(df_uniGramToken,'unigram.csv')
#write.csv(df_biGramToken,'bigram.csv')
#write.csv(df_triGramToken,'trigram.csv')
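
For completeness, the TDM alternative mentioned above could look roughly like the sketch below (not run as part of this analysis): a TermDocumentMatrix is built directly from the corpus with an RWeka tokenizer supplying the n-grams, and frequencies are obtained by summing the rows.

# Sketch of the TDM alternative: a bigram term-document matrix built with RWeka
bigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_bigram <- TermDocumentMatrix(US_corpus_20k, control = list(tokenize = bigram_tokenizer))
# slam is installed as a dependency of tm; row_sums gives the per-term totals
bigram_freq <- sort(slam::row_sums(tdm_bigram), decreasing = TRUE)
head(bigram_freq, 25)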

Plot Top-25 Unigrams

#useful during dev
#df_uniGramToken <- read.csv('unigram.csv')
#order by decreasing frequency and keep the top 25, dropping the token "e"
#(likely an artefact of the earlier byte-level encoding conversion)
uniG <- head(df_uniGramToken[order(-df_uniGramToken$Freq),],26)
names(uniG) <- c("word","frequency")
data <- uniG %>% filter(word != 'e') 

ggplot2::ggplot(data, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  labs(x = NULL, y = NULL) +
  ggtitle("UniGram Frequency") +
  theme(legend.position = "none")
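
In the chart above, the words follow the default alphabetical factor ordering. If a frequency-ordered chart is preferred, one option (a sketch, not part of the original analysis; it drops the polar coordinates for a plain bar chart) is to reorder the factor levels before plotting:

# Sketch: reorder the factor levels so bars run from most to least frequent
data$word <- reorder(data$word, -data$frequency)
ggplot2::ggplot(data, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  labs(x = NULL, y = NULL) +
  ggtitle("UniGram Frequency (ordered)") +
  theme(legend.position = "none")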

Plot Top-25 Bigrams

#useful during dev
#df_biGramToken <- read.csv('bigram.csv')
#order by decreasing frequency
BiGram <- head(df_biGramToken[order(-df_biGramToken$Freq),],25)
names(BiGram) <- c("word","frequency")

ggplot2::ggplot(BiGram, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  labs(x = NULL, y = NULL) +
  ggtitle("BiGram Frequency") +
  theme(legend.position = "none")

Plot Top-25 Trigrams

#useful during dev
#df_triGramToken <- read.csv('trigram.csv') 
#order by decreasing frequency
TriGram <- head(df_triGramToken[order(-df_triGramToken$Freq),],25)
names(TriGram) <- c("word","frequency")

ggplot2::ggplot(TriGram, aes(x = word, y = frequency, fill = word)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  labs(x = NULL, y = NULL) +
  ggtitle("TriGram Frequency") +
  theme(legend.position = "none")

Conclusions

In this work, we have read three (3) different data sources (blogs, news and Twitter feeds). From each we took a sample, merged the samples into a single body of text (the corpus) and applied a set of transformations to it (removed numbers, stop-words and punctuation, and converted the text to lowercase).

For the merged dataset (the corpus), we plotted a “word cloud”, a color plot of the 150 most common words in which higher-frequency words are drawn larger. We then “tokenized” the corpus to extract the frequencies of the most common unigrams, bigrams and trigrams, i.e. the most common single words and combinations of 2 and 3 words respectively.

Some basic assumptions were in place for the exploratory data analysis. Those were:

  1. All 3 datasets contributed with the same “weight”, because they were merged before the words were counted. This may not be ideal, since Twitter feeds, for example, are likely to favour different frequent words than blogs. In the context of generic word prediction, however, this may not be important.
  2. Different spellings of the same word, and misspelled words, were not taken into account.
  3. The same applies to names; for example, the name “Alexander” appears in 5 variations among the generated unigrams.
  4. We sampled 20,000 lines from each of the 3 datasets, even though the datasets differ considerably in size, so each sample represents a different fraction of its source. This was done for simplicity.
  5. Profanities were not removed. This may change later in the process; for now they are kept, so we can see how often unwanted words might be predicted (a possible filtering step is sketched after this list).
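
If profanity filtering is added later, one possible approach (a sketch only; the word-list file name is hypothetical) is to treat the profanity list like the stop-word list and remove it during the corpus transformation step:

# Sketch: remove profanities the same way stop words are removed.
# "profanity_list.txt" is a hypothetical file with one unwanted word per line.
profanity_words <- readLines("profanity_list.txt", encoding = "UTF-8")
US_corpus_20k <- tm_map(US_corpus_20k, removeWords, profanity_words)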

Moving forward

It remains to create the TDM (Term-Document Matrix, https://en.wikipedia.org/wiki/Document-term_matrix), from which we can further eliminate sparse terms. Term sparsity can also be compared across samples of different sizes, to identify an ideal sample size: one that is not too small, yet beyond which larger samples add little.
With the right sample in place, we will frame the prediction problem: build the n-grams, remove the last word of each, and then attempt to predict it with an algorithm.
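
A rough sketch of these next steps, assuming the corpus and the trigram table from above are reused (the sparsity threshold and the helper function below are illustrative only, not a final design):

# Sketch: build a document-term matrix and drop very sparse terms
dtm <- DocumentTermMatrix(US_corpus_20k)
dtm_dense <- removeSparseTerms(dtm, 0.99)  # keep terms appearing in at least ~1% of documents
dim(dtm_dense)

# Sketch: split each trigram into a two-word prefix and the word to predict
names(df_triGramToken) <- c("ngram","freq")
df_triGramToken$ngram  <- as.character(df_triGramToken$ngram)
df_triGramToken$prefix <- sub(" \\S+$", "", df_triGramToken$ngram)  # first two words
df_triGramToken$target <- sub("^.* ", "", df_triGramToken$ngram)    # last word

# A naive predictor: for a given two-word prefix, return the most frequent last word
predict_next <- function(prefix, ngrams = df_triGramToken) {
  candidates <- ngrams[ngrams$prefix == prefix, ]
  if (nrow(candidates) == 0) return(NA_character_)
  candidates$target[which.max(candidates$freq)]
}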

N.Perdikis