Loading the required libraries

# Text Mining Package
library(tm)
## Loading required package: NLP
# Plotting Package
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# Quantitative Analysis for text data
library(quanteda)
## quanteda version 1.0.0
## Using 7 of 8 threads for parallel computing
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View

Obtaining the Data

The file is downloaded from the URL provided and unzipped, and the working directory is set to the en_US folder.

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

download.file(url,destfile="~/data.zip")

unzip(zipfile="~/data.zip",exdir="~/data")

setwd("~/data/final/en_US")

Cleanup using system commands

It's faster and easier to perform some essential cleanup of the data using operating system commands, so that only clean data is brought into R.

System commands are run to remove apostrophes, strip numbers followed by letters (like 2nd, 20th, etc.), remove the remaining numbers, and finally remove URLs from the working data set.

# Run the following commands in the terminal window 
# change directory to the path ~/data/final/en_US

# remove all apostrophes
sed -i  "s/'//g" *.*

# remove all numbers followed by characters
sed -Ei  "s/[0-9]+[A-Za-z]+//g" *.*

# remove all numbers
sed -Ei  "s/[0-9]+//g" *.*
  
# remove all URLs
sed -Ei  's!https?://[^[:space:]]*!!g' *.*
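
For environments where GNU sed is not available (for example a plain Windows setup), a roughly equivalent cleanup could be done in R after the files are read in. This is only a sketch, assuming the raw text is already in a character vector:

# sketch of an R alternative to the sed cleanup above
# (assumes `lines` is a character vector of raw text lines)
clean_lines <- function(lines) {
  lines <- gsub("'", "", lines)                       # remove apostrophes
  lines <- gsub("[0-9]+[A-Za-z]+", "", lines)         # numbers followed by letters (2nd, 20th, ...)
  lines <- gsub("[0-9]+", "", lines)                  # remaining numbers
  lines <- gsub("https?://[^[:space:]]+", "", lines)  # URLs
  lines
}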

Read the raw files into R objects

setwd("~/data/final/en_US")

blogs <- readLines("en_US.blogs.txt")

news <- readLines("en_US.news.txt")

twitter <- readLines("en_US.twitter.txt")
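
Note: readLines() may warn about embedded nul characters or an incomplete final line on some of these files (this has been reported for en_US.news.txt in particular). If that happens, the skipNul argument is one way to work around it:

# optional: ignore embedded nuls if readLines() warns about them
news <- readLines("en_US.news.txt", skipNul = TRUE)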

Exploring the Data

The data set is quite large, as the following commands show. This calls for sampling, so that the exploratory analysis can be performed efficiently on a subset of the data.

str(blogs)
##  chr [1:899288] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”." ...
str(news)
##  chr [1:77259] "He wasnt home alone, apparently." ...
str(twitter)
##  chr [1:2360148] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long." ...
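
To put some numbers behind "quite large", the line counts and approximate in-memory sizes can be summarised quickly (a small sketch; output omitted):

# summarise line counts and approximate in-memory size of each source
data.frame(source = c("blogs", "news", "twitter"),
           lines  = c(length(blogs), length(news), length(twitter)),
           size   = c(format(object.size(blogs), units = "MB"),
                      format(object.size(news), units = "MB"),
                      format(object.size(twitter), units = "MB")))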

Sampling the Data

The correct way to size the sample is to estimate the number of distinct words p and the number of words per document l; the number of lines needed for a good model would then be about 10p/l.
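
As an illustration of that heuristic on a tiny made-up corpus (a sketch only; the real estimate would use the full word counts):

# illustrative sketch of the 10*p/l heuristic on a toy corpus
toy.lines <- c("the quick brown fox", "the lazy dog", "a quick dog")
toy.words <- unlist(strsplit(toy.lines, " "))
p <- length(unique(toy.words))              # number of distinct words
l <- length(toy.words) / length(toy.lines)  # average words per line
ceiling(10 * p / l)                         # suggested number of lines to sample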

Since we are not yet at the stage of finding the terms, we will take about 15% of each document for this analysis.

Since sampling is random, we set the seed for reproducibility.

set.seed(54321)

sample.blogs <- sample(blogs,size = length(blogs)*0.15,replace = FALSE)

sample.news <- sample(news,size = length(news)*0.15,replace = FALSE)

sample.tweets <- sample(twitter,size = length(twitter)*0.15,replace = FALSE)

Saving the Samples and performing cleanup
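
Note that write() assumes the target directory already exists; if ~/data/sample has not been created yet, it can be set up first:

# create the sample directory if it does not exist yet
dir.create("~/data/sample", recursive = TRUE, showWarnings = FALSE)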

# remove non-English (non-ASCII) characters
sample.blogs <- iconv(sample.blogs,from = "latin1",to = "ASCII",sub = "")

# save sample blogs
write(sample.blogs,file = "~/data/sample/sample_blogs.txt")

# remove non-English (non-ASCII) characters
sample.news <- iconv(sample.news,from = "latin1",to = "ASCII",sub = "")

# save sample news
write(sample.news,file = "~/data/sample/sample_news.txt")

# remove non-English (non-ASCII) characters
sample.tweets <- iconv(sample.tweets,from = "latin1",to = "ASCII",sub = "")

# save sample tweets
write(sample.tweets,file = "~/data/sample/sample_tweets.txt")

# Cleanup
rm("blogs","news","twitter","sample.blogs","sample.news","sample.tweets")

# garbage collection
gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1800833 96.2    6619081 353.5  6086488 325.1
## Vcells 4889365 37.4   67677657 516.4 71710216 547.2

Tokenizing the Data

In this step, we form a corpus from the documents we have, and then apply some cleaning steps that will help us tokenize it easily.

setwd("~/")

# build a tm VCorpus from the sampled documents
en.corpora <- VCorpus(DirSource('~/data/sample/'))

# convert it into a quanteda corpus
en.corpus <- corpus(en.corpora)

# tokenize, removing numbers, punctuation, symbols, hyphens and URLs
en.tokens <- tokens(en.corpus, what = "word",
                    remove_numbers = TRUE, remove_punct = TRUE,
                    remove_symbols = TRUE, remove_hyphens = TRUE,
                    remove_url = TRUE)

# convert the tokens into lowercase
en.tokens <- tokens_tolower(en.tokens)

# remove stop words (very common words like "the", "an", "a", etc.)
en.tokens <- tokens_select(en.tokens, stopwords("english"),selection = "remove")

# remove profanity, using the word list obtained from GitHub
# https://github.com/words/profanities
profanity <- readLines("~/data/final/en_US/gitbadlist.txt")
en.tokens <- tokens_select(en.tokens, profanity, selection = "remove")

Clean up the large objects and run garbage collection

# remove the large corpus objects and the profanity list
rm("en.corpora","en.corpus","profanity")

# run garbage collection
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2019429 107.9    6619081 353.5  6086488 325.1
## Vcells 8255116  63.0   65034550 496.2 81175891 619.4

Ngram the Tokens

# generate Unigrams
en.unigram <- tokens_ngrams(en.tokens,1)

# generate Bigrams
en.bigram <- tokens_ngrams(en.tokens,2)

# generate Trigrams
en.trigram <- tokens_ngrams(en.tokens,3)
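
By default tokens_ngrams() joins the words of each n-gram with an underscore (concatenator = "_"), so a trigram appears as word1_word2_word3. A quick peek at one document confirms the format:

# inspect a few generated trigrams; terms within an n-gram are joined by "_"
head(en.trigram[[1]], 5)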

Calculate normal term frequency

# We need a function to calculate relative term frequency,
# i.e. the occurrence of a particular term relative to the length of the document
term.frequency <- function(row) {
  row / sum(row)
}
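
As a quick check of what the function returns, applied to a toy count vector:

# toy example: term counts for a single document
term.frequency(c(the = 3, cat = 1, sat = 1))
# returns the = 0.6, cat = 0.2, sat = 0.2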

Unigram Term Frequency

# compute the document-feature matrix (dfm)
en.unigram.dfm <- dfm(en.unigram, tolower = FALSE)

# convert to a regular matrix
en.unigram.matrix <- as.matrix(en.unigram.dfm)

# compute the normalized term frequency within each document
en.unigram.matrix <- apply(en.unigram.matrix, 1, term.frequency)

# sum each term's normalized frequency across the documents
en.unigram.matrix <- apply(en.unigram.matrix, 1, sum)
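
Note that apply() over rows returns its results as columns, so after the first apply() the terms sit in the rows and the documents in the columns; the second apply() therefore sums each term across the sampled documents. A tiny toy example (made-up counts) illustrates the dimension flip:

# toy illustration of the dimension flip caused by apply()
m <- matrix(c(2, 0, 1, 3), nrow = 2,
            dimnames = list(c("doc1", "doc2"), c("cat", "dog")))
norm <- apply(m, 1, term.frequency)  # 2 x 2: rows = terms, columns = documents
apply(norm, 1, sum)                  # one aggregated value per term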

Unigram Plot

# convert the top 25 values into a dataframe
tab_uni <- as.data.frame(head(sort(en.unigram.matrix,decreasing = TRUE),25))

# add a new column of rownames
tab_uni$terms <- rownames(tab_uni)

# rename the labels of the columns
names(tab_uni) <- c("Frequency","Unigram")

# plot the graph
ggplot(tab_uni, aes(x=reorder(Unigram,Frequency), y=Frequency )) + 
  geom_bar(stat="identity", width=.5, fill="tomato3") +  coord_flip() + 
  xlab("Unigram") + ylab("Frequency") +
  labs(title="Unigram Frequency Chart", 
       subtitle="Top 25 Normalized Terms ", 
       caption="source: tab_uni") 

Bigram Term Frequency

# compute the document-feature matrix (dfm)
en.bigram.dfm <- dfm(en.bigram, tolower = FALSE)

# convert to a regular matrix
en.bigram.matrix <- as.matrix(en.bigram.dfm)

# compute the normalized term frequency
en.bigram.matrix <- apply(en.bigram.matrix, 1, term.frequency)

# compute the sum across documents
en.bigram.matrix <- apply(en.bigram.matrix, 1, sum)

Bigram Plot

# convert the top 25 values into a dataframe
tab_bi <- as.data.frame(head(sort(en.bigram.matrix,decreasing = TRUE),25))

# add a new column of rownames
tab_bi$terms <- rownames(tab_bi)

# rename the labels of the columns
names(tab_bi) <- c("Frequency","Bigram")

# plot the graph
ggplot(tab_bi, aes(x=reorder(Bigram,Frequency), y=Frequency )) + 
  geom_bar(stat="identity", width=.5, fill="springgreen3") +  coord_flip() + 
  xlab("Bigram") + ylab("Frequency") +
  labs(title="Bigram Frequency Chart", 
       subtitle="Top 25 Normalized Terms ", 
       caption="source: tab_bi") 

Trigram Term Frequency

# compute the document-feature matrix (dfm)
en.trigram.dfm <- dfm(en.trigram, tolower = FALSE)

# convert to a regular matrix
en.trigram.matrix <- as.matrix(en.trigram.dfm)

# compute the normalized term frequency
en.trigram.matrix <- apply(en.trigram.matrix, 1, term.frequency)

# compute the sum across documents
en.trigram.matrix <- apply(en.trigram.matrix, 1, sum)

Trigram Plot

# convert the top 25 values into a dataframe
tab_tri <- as.data.frame(head(sort(en.trigram.matrix,decreasing = TRUE),25))

# add a new column of rownames
tab_tri$terms <- rownames(tab_tri)

# rename the labels of the columns
names(tab_tri) <- c("Frequency","Trigram")

# plot the graph
ggplot(tab_tri, aes(x=reorder(Trigram,Frequency), y=Frequency )) + 
  geom_bar(stat="identity", width=.5, fill="steelblue3") +  coord_flip() + 
  xlab("Trigram") + ylab("Frequency") +
  labs(title="Trigram Frequency Chart", 
       subtitle="Top 25 Normalized Terms ", 
       caption="source: tab_tri") 

Next Steps

Our next step is to build a prediction model that draws on the n-grams and uses an algorithm such as a random forest. Once we have a model with good prediction accuracy, it will be used to predict the next best word.

A Shiny application will then be built around this prediction model to test it, with just-in-time tweaks such as language selection and model selection.