Swiftkey Capstone project - Milestone report

=============== SONJA OFFWOOD

Data exploration

Loading the data

To start off the analysis, we have to load the data and do some exploratory analysis on it.

# NOTE: this directory is machine-specific; point it at the folder that holds
# the extracted en_US corpus files before running.
setwd("C:/Users/SJD/OneDrive - FRG/Own/Data Science/John Hopkins Specialisation/Capstone project/final/en_US")

# Read every line of one corpus file as UTF-8 text.
#
# The connection is opened in binary mode ("rb") rather than text mode.
# On Windows a text-mode read can stop at an embedded SUB (Ctrl-Z, 0x1A)
# character and silently return only the first part of the file.
# NOTE(review): the low line count previously reported for the news file is
# consistent with that truncation - confirm by re-running the summary table.
#
# `warn = FALSE` silences only the "incomplete final line" / embedded-nul
# notices from readLines(); any other warning is still shown instead of being
# swallowed by a blanket suppressWarnings().
#
# path: file name, resolved against the working directory set above.
# Returns a character vector with one element per line.
read_corpus_lines <- function(path) {
  con <- file(path, "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}

doc_twitter <- read_corpus_lines("en_US.twitter.txt")
doc_blogs <- read_corpus_lines("en_US.blogs.txt")
doc_news <- read_corpus_lines("en_US.news.txt")

Understanding the data

We want to see how many lines each text source has, and then understand the number of words and characters in each of the text sources. It is also useful to know the maximum number of words found in a single line of each source.

# Number of lines (documents) in each source.
len_twitter <- length(doc_twitter)
len_blogs <- length(doc_blogs)
len_news <- length(doc_news)

# Words per line: every line is split on single spaces and the pieces counted.
# lengths() gives the same per-line counts as sapply(..., length) without the
# R-level loop.
words_twitter <- lengths(strsplit(doc_twitter, " "))
words_blogs <- lengths(strsplit(doc_blogs, " "))
words_news <- lengths(strsplit(doc_news, " "))

# Total words per source. The per-line counts above are reused so each corpus
# is tokenised only once.
wordcount_twitter <- sum(words_twitter)
wordcount_blogs <- sum(words_blogs)
wordcount_news <- sum(words_news)

# Total characters per source. nchar() counts characters directly instead of
# materialising every line as a vector of single characters.
char_twitter <- sum(nchar(doc_twitter, type = "chars"))
char_blogs <- sum(nchar(doc_blogs, type = "chars"))
char_news <- sum(nchar(doc_news, type = "chars"))

# Largest number of words found in a single line of each source.
max_twitter <- max(words_twitter)
max_blogs <- max(words_blogs)
max_news <- max(words_news)

# One row per source, one column per statistic.
text_summary <- data.frame(
  fileName = c("Twitter", "Blogs", "News"),
  line_count = c(len_twitter, len_blogs, len_news),
  word_count = c(wordcount_twitter, wordcount_blogs, wordcount_news),
  char_count = c(char_twitter, char_blogs, char_news),
  max_length = c(max_twitter, max_blogs, max_news)
)

text_summary

##   fileName line_count word_count char_count max_length
## 1  Twitter    2360148   30373583  162096241         47
## 2    Blogs     899288   37334131  206824505       6630
## 3     News      77259    2643969   15639408       1031

# Drop the intermediate statistics now that they live in text_summary.
rm(
  len_twitter, len_blogs, len_news,
  words_twitter, words_blogs, words_news,
  char_twitter, char_blogs, char_news,
  max_twitter, max_blogs, max_news
)

Preprocessing the data

Because of the large amount of data available, we need to sample a smaller selection of the existing text to run further analysis. To do this, we take 0.1% of each of the three data sets (p = 0.001 in the code below) and combine them into the final set to be used.

# Fix the random stream so the sample is reproducible.
set.seed(1234)

# Fraction of lines drawn from each source.
p <- 0.001

# Draw from twitter, then blogs, then news (the order matters for the random
# stream) and concatenate the three samples into one character vector.
full_data <- c(
  sample(doc_twitter, p * length(doc_twitter)),
  sample(doc_blogs, p * length(doc_blogs)),
  sample(doc_news, p * length(doc_news))
)

# The full corpora are no longer needed; free the memory.
rm(doc_twitter, doc_blogs, doc_news)

To be able to perform some data transformations, we need to load the tm library.

library(NLP)
library(tm)

After the data is saved into a Corpus, the following preprocessing steps are performed on the data: punctuation is removed, all numbers are removed, all text is converted to lower case letters, stopwords are removed, and extra white space is stripped.

# Build a corpus with one document per sampled line, then normalise the text.
my_corpus <- VCorpus(VectorSource(full_data))

# Lower-case first so that stop-word matching is case-insensitive.
my_corpus <- tm_map(my_corpus, content_transformer(tolower))

# Remove stop words while apostrophes are still in the text. The English
# stop-word list contains contractions such as "don't" and "i'm"; if
# punctuation were stripped first these would become "dont" / "im", no longer
# match the list, and survive into the frequency tables.
my_corpus <- tm_map(my_corpus, removeWords, stopwords("english"))

# Strip remaining punctuation and digits, then collapse the runs of white
# space left behind by the removals.
my_corpus <- tm_map(my_corpus, removePunctuation)
my_corpus <- tm_map(my_corpus, removeNumbers)
my_corpus <- tm_map(my_corpus, stripWhitespace)

Word frequencies

We first load the libraries required for this section of the analysis.

library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

In order to understand more about the word frequencies, we create a document term matrix which is then used for further analysis. We start off the analysis with plotting a wordcloud of the most frequent words

# Documents in rows, terms in columns.
dtm <- DocumentTermMatrix(my_corpus)

# Column totals give the number of occurrences of each term in the sample.
words_frequency <- colSums(as.matrix(dtm))

# Word cloud of up to 200 terms, most frequent placed first.
suppressWarnings(
  wordcloud(
    words = names(words_frequency),
    freq = words_frequency,
    min.freq = 1,
    max.words = 200,
    random.order = FALSE,
    rot.per = 0.4,
    colors = brewer.pal(3, "Dark2")
  )
)

We next want to find the 20 most frequent words used in the dataset after the transformations have been performed.

# Indices of the terms from most to least frequent.
ord <- order(words_frequency, decreasing = TRUE)

# Keep the 20 highest counts (names are the terms themselves).
top20 <- head(words_frequency[ord], n = 20)

# The factor levels follow the frequency ranking so the bars are drawn in that
# order instead of alphabetically.
top20_data <- data.frame(
  words = factor(names(top20), levels = names(top20)),
  freq = top20
)

# Bar chart of the 20 most frequent words.
g <- ggplot(data = top20_data, aes(x = words, y = freq)) +
  geom_bar(stat = "identity", color = "blue", fill = "blue") +
  labs(x = "word", y = "frequency")
g

nGrams frequencies

To make this relevant to the Capstone project, we will want to have an understanding of which words generally follow each other. For this reason we want to look at the ngrams of the data and not only at the individual words. To do a similar analysis as what we did above, we will need to create tokenizers to pass into the document term matrix function.

library(RWeka)
# Build a tokenizer that splits a document into n-grams of exactly `n` words
# using RWeka's NGramTokenizer.
make_ngram_tokenizer <- function(n) {
  force(n)
  function(x) {
    NGramTokenizer(x, Weka_control(min = n, max = n))
  }
}

# Tokenizers for 2-, 3- and 4-word sequences.
BigramTokenizer <- make_ngram_tokenizer(2)
TrigramTokenizer <- make_ngram_tokenizer(3)
FourgramTokenizer <- make_ngram_tokenizer(4)

# Document-term matrices whose "terms" are 2-, 3- and 4-word sequences.
bigram <- DocumentTermMatrix(
  my_corpus,
  control = list(tokenize = BigramTokenizer)
)
trigram <- DocumentTermMatrix(
  my_corpus,
  control = list(tokenize = TrigramTokenizer)
)
fourgram <- DocumentTermMatrix(
  my_corpus,
  control = list(tokenize = FourgramTokenizer)
)

Below we have a look at the top 20 bigrams on our dataset.

# Total occurrences of each bigram across all documents.
bigram_frequency <- colSums(as.matrix(bigram))

# Rank the bigrams from most to least frequent and keep the first 20.
bigram_ord <- order(bigram_frequency, decreasing = TRUE)
bigram_top20 <- head(bigram_frequency[bigram_ord], n = 20)

# Factor levels follow the ranking so the bars keep the frequency order.
bigram_top20_data <- data.frame(
  words = factor(names(bigram_top20), levels = names(bigram_top20)),
  freq = bigram_top20
)

# Bar chart of the 20 most frequent bigrams; the labels are rotated because
# two-word terms would otherwise overlap on the x axis.
g <- ggplot(data = bigram_top20_data, aes(x = words, y = freq)) +
  geom_bar(stat = "identity", color = "red", fill = "red") +
  labs(x = "word", y = "frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
g

Model for prediction

The current plan for the model is to return the word which has the highest frequency in the test set following the n words typed by the user. For this it is important not to remove the stop words when transforming the data as these will be critical in the prediction. The steps envisaged are set out below, assuming the user types three words: 1. Find the word following the three typed words in the 4-gram dictionary 2. If no match is found, find the word with the highest frequency following the last two words typed in the 3-gram dictionary 3. If still no match is found, find the word with the highest frequency following the last word typed 4. To the extent there is still no match, return a dummy word