Introduction

The goal of this project is to show that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.

The motivation for this project is to:

1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you have amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

library(quanteda)
library(wordcloud)
library(ggplot2)
library(tm)
library(stringi)
library(data.table)

Read data

Download the datasets if they have not already been downloaded.

# download file if it doesn't exist
if(!file.exists('Coursera-SwiftKey.zip')){
  download.file('https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip',
                destfile = './Coursera-SwiftKey.zip', method = 'curl', quiet = T)
  unzip('./Coursera-SwiftKey.zip')
}

# download the profanity list if it doesn't exist
if (!file.exists('badWords.txt')) {
  download.file('https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en',
                destfile = './badWords.txt',
                method = 'curl', quiet = TRUE)
}

Read the three datasets and display their summary statistics.

blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE, warn= FALSE)
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

size.blogs <- file.size('final/en_US/en_US.blogs.txt')/1024^2
size.news <- file.size('final/en_US/en_US.news.txt')/1024^2
size.twitter <- file.size('final/en_US/en_US.twitter.txt')/1024^2

lineCount.blogs <- length(blogs)
lineCount.news <- length(news)
lineCount.twitter <- length(twitter)

WordCount.blogs <- sum(stri_count_words(blogs))
WordCount.news <- sum(stri_count_words(news))
WordCount.twitter <- sum(stri_count_words(twitter))

charCount.blogs <- sum(nchar(blogs))
charCount.news <- sum(nchar(news))
charCount.twitter <- sum(nchar(twitter))

summary <- data.frame(c('Blogs','News','Twitter'),
                      c(size.blogs, size.news, size.twitter), 
                      c(lineCount.blogs, lineCount.news, lineCount.twitter),
                      c(WordCount.blogs,WordCount.news,WordCount.twitter),
                      c(charCount.blogs, charCount.news, charCount.twitter))
colnames(summary) <- c('Files', 'Size (MB)', 'Lines', 'Words', 'Characters')

summary
##     Files Size (MB)   Lines    Words Characters
## 1   Blogs  200.4242  899288 37546239  206824505
## 2    News  196.2775   77259  2674536   15639408
## 3 Twitter  159.3641 2360148 30093413  162096241

Because the full datasets are too large to process quickly, we select only the first 10,000 lines of each. A corpus is then built that contains data from all three sources: blogs, news and Twitter.

dsize <- 10000
blogs <- readLines("final/en_US/en_US.blogs.txt",dsize, encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt",dsize, encoding = "UTF-8", skipNul = TRUE,
                  warn= FALSE)
twitter <- readLines("final/en_US/en_US.twitter.txt",dsize, encoding = "UTF-8", skipNul = TRUE)
corpus <- Corpus(VectorSource(c(blogs, news, twitter)))

Cleaning data

Before performing exploratory data analysis, we should clean the data: convert the text to lower case, split it into words, and remove:

- URLs
- special characters
- punctuation
- numbers
- stopwords
- symbols
- bad words (profanity)

Tokens <- tokens(
  x = tolower(corpus), what = "word",
  remove_punct = TRUE,
  remove_numbers = TRUE,
  split_hyphens = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)

prof <- readLines('badWords.txt', encoding = "UTF-8", skipNul = TRUE)
 
Tokens1 <- tokens_remove(Tokens, stopwords("english"))
Tokens1 <- tokens_remove(Tokens1, pattern = prof)
words <- tokens_wordstem(Tokens1, language = "english")
corpus <- Corpus(VectorSource(words))

The most frequent unigrams

We can visualize a word cloud representation as a way to summarize the data:

wordcloud(corpus, max.words = 200, random.order = FALSE, 
           rot.per = 0.15, scale = c(2.9, 0.4), use.r.layout = FALSE, 
           colors = brewer.pal(8, "Dark2"))

Data preparation

We perform the exploratory data analysis. We want to find the most frequently occurring words in the data, specifically the 20 most common unigrams, bigrams, and trigrams.

bi_gram <- tokens_ngrams(words, n = 2)
tri_gram <- tokens_ngrams(words, n = 3)

uni_DFM <- dfm(words)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)

uni_DFM <- dfm_trim(uni_DFM, min_termfreq = 3)
bi_DFM <- dfm_trim(bi_DFM, min_termfreq = 3)
tri_DFM <- dfm_trim(tri_DFM, min_termfreq = 3)

sums_U <- colSums(uni_DFM)
sums_B <- colSums(bi_DFM)
sums_T <- colSums(tri_DFM)

unigrams <- data.table(word_1 = names(sums_U), count = sums_U)

bi_split <- strsplit(names(sums_B), "_", fixed = TRUE)
bigrams <- data.table(
  word_1 = sapply(bi_split, '[[', 1),
  word_2 = sapply(bi_split, '[[', 2),
  count = sums_B)

tri_split <- strsplit(names(sums_T), "_", fixed = TRUE)
trigrams <- data.table(
  word_1 = sapply(tri_split, '[[', 1),
  word_2 = sapply(tri_split, '[[', 2),
  word_3 = sapply(tri_split, '[[', 3),
  count = sums_T)


plot_unigrams <- unigrams[order(count, decreasing = TRUE)][1:20,]
plot_bigrams <- bigrams[order(count, decreasing = TRUE)][1:20,]
plot_trigrams <- trigrams[order(count, decreasing = TRUE)][1:20,]

Plots

ggplot(plot_unigrams, aes(x=reorder(word_1, count),y=count)) + 
        geom_bar(stat="identity", color= 'white', fill="grey") + 
        ggtitle("Most Common Unigrams") + 
        xlab("") +  
        ylab("Frequency") + 
        coord_flip() +
        theme(plot.title = element_text(hjust = 0.5)) 

ggplot(plot_bigrams, aes(x=reorder(paste(word_1,word_2), count),y=count)) + 
        geom_bar(stat="identity", color= 'white', fill="grey") + 
        ggtitle("Most Common Bigrams") + 
        xlab("") + 
        ylab("Frequency") + 
        coord_flip() +
        theme(plot.title = element_text(hjust = 0.5)) 

ggplot(plot_trigrams, aes(x=reorder(paste(word_1,word_2,word_3), count),y=count)) + 
        geom_bar(stat="identity", color= 'white', fill="grey") + 
        ggtitle("Most Common Trigrams") + 
        xlab("") + 
        ylab("Frequency") + 
        coord_flip() +
        theme(plot.title = element_text(hjust = 0.5)) 

Future goal

The goal is to create a predictive model that suggests the most probable word to follow the user’s input. The Shiny app should take a phrase (multiple words) in a text box as input and output a prediction of the next word.
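
As a sketch of how that interface might look, the code below lays out a minimal Shiny app. This is an assumption about the final design, not the finished app: predict_next_word() is a hypothetical helper (one possible version is sketched after the next paragraph), and everything else uses standard Shiny functions.

library(shiny)

# Minimal sketch of the planned interface; predict_next_word() is a
# hypothetical helper standing in for the n-gram lookup described below.
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  verbatimTextOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    # show nothing until the user has typed something
    if (nchar(trimws(input$phrase)) == 0) return("")
    paste(predict_next_word(input$phrase), collapse = ", ")
  })
}

# shinyApp(ui = ui, server = server)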

The plan is to use an n-gram model with a word-frequency lookup similar to the one performed in this exploratory analysis, estimating the probability of the next word given the preceding words. For the Shiny app, the plan is to build a simple interface where the user can enter a string of text; the prediction model will then return a list of suggested next words.
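
As a rough illustration of that lookup, here is a minimal back-off style sketch built on the unigrams, bigrams and trigrams data.tables created above. The function name predict_next_word() and the raw-count scoring are assumptions for illustration only; the final model will need proper smoothing (for example Kneser-Ney or stupid backoff), should reuse the same cleaning steps applied to the corpus, and, because the exploratory tables were built from stemmed tokens with stopwords removed, would currently return stems rather than full words.

library(quanteda)
library(data.table)

# Sketch of a back-off next-word lookup over the n-gram tables built above.
# Raw counts only; no smoothing yet.
predict_next_word <- function(phrase, n_suggestions = 3) {
  toks <- tolower(tokens(phrase, remove_punct = TRUE, remove_numbers = TRUE)[[1]])
  len <- length(toks)

  # try trigrams first: match the last two words of the input
  if (len >= 2) {
    hits <- trigrams[word_1 == toks[len - 1] & word_2 == toks[len]][order(-count)]
    if (nrow(hits) > 0) return(head(hits$word_3, n_suggestions))
  }
  # back off to bigrams: match the last word only
  if (len >= 1) {
    hits <- bigrams[word_1 == toks[len]][order(-count)]
    if (nrow(hits) > 0) return(head(hits$word_2, n_suggestions))
  }
  # final fallback: the most frequent unigrams overall
  head(unigrams[order(-count)]$word_1, n_suggestions)
}

For example, predict_next_word("thanks for the") would first look for trigrams whose first two words match the last two words of the input, and fall back to bigram and then unigram counts if no match is found.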