Executive Summary

The goal of this project is to create a predictive text model from the en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt files.

This report presents a brief exploratory analysis of that data as a first step toward completing the final Capstone Project.
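
For completeness, the raw files can be downloaded and unpacked before reading them in. This is a minimal sketch, assuming the course-provided Coursera-SwiftKey.zip link (worth verifying against the course page) and the ~/Desktop/final/en_US/ layout used below:

# Download and unpack the SwiftKey dataset if it is not already on disk
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("~/Desktop/final/en_US/en_US.blogs.txt")) {
  download.file(url, destfile = "~/Desktop/Coursera-SwiftKey.zip", mode = "wb")
  unzip("~/Desktop/Coursera-SwiftKey.zip", exdir = "~/Desktop")
}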

blogs <- readLines("~/Desktop/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("~/Desktop/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("~/Desktop/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
library(stringi)
blogs_size <- file.info("~/Desktop/final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news_size <- file.info("~/Desktop/final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter_size <- file.info("~/Desktop/final/en_US/en_US.twitter.txt")$size / 1024 ^ 2

# Count the words in each line of each file
blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)

# Summary of the data sets
data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(blogs_size, news_size, twitter_size),
           num.lines = c(length(blogs), length(news), length(twitter)),
           num.words = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
           mean.num.words = c(mean(blogs_words), mean(news_words), mean(twitter_words)))
##    source file.size.MB num.lines num.words mean.num.words
## 1   blogs     200.4242    899288  37546250       41.75109
## 2    news     196.2775   1010242  34762395       34.40997
## 3 twitter     159.3641   2360148  30093413       12.75065
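
Line lengths also differ a lot between the sources. A quick check using the stringi functions already loaded:

# Longest line (in characters) in each source
max(stri_length(blogs))
max(stri_length(news))
max(stri_length(twitter))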

Cleaning the Data

Before the exploratory analysis I'm going to clean the data. To keep the analysis fast, I'll sample 1% of each file (see the sample() calls below).

library(tm)
## Loading required package: NLP
data.sample <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
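
Note that sample() draws a random subset, so the exact lines (and the counts further down) will vary between runs. A seed set before the sample() calls above would make the report reproducible, for example:

set.seed(1234)  # would need to run before the sample() calls above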

# Create corpus and clean the data
corpus <- VCorpus(VectorSource(data.sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
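
It's worth spot-checking a few cleaned documents before building the term-document matrices. A minimal sketch:

# Spot-check the first three cleaned documents
for (i in 1:3) writeLines(as.character(corpus[[i]]))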

Exploratory Analysis

Now that the data has been cleaned of punctuation, special characters, numbers, stop words, and URLs, it's time to visualize it with frequency plots.

library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
options(mc.cores=1)

getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
makePlot <- function(data, label) {
  ggplot(data[1:20,], aes(reorder(word, -freq), freq)) +
         labs(x = label, y = "Frequency of words") +
         theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("orange"))
}

# Get the frequencies of the most common n-grams in the data
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
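
As an aside, the unigram frequencies can also show how few unique words cover most of the text (relative to the terms kept by removeSparseTerms). A minimal sketch:

coverage <- cumsum(freq1$freq) / sum(freq1$freq)  # cumulative share of word instances
which(coverage >= 0.5)[1]  # unique words needed for 50% coverage
which(coverage >= 0.9)[1]  # unique words needed for 90% coverage
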
makePlot(freq1, "20 Most Common Unigrams")

makePlot(freq2, "20 Most Common Bigrams")

makePlot(freq3, "20 Most Common Trigrams")

Conclusion for the Prediction Algorithm

This concludes the first part of the capstone project. The next step is to build the predictive model and present it in a Shiny app using the same data.
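
As a rough preview of that model, the trigram table freq3 computed above can already back a naive next-word lookup. This is only an illustrative sketch, not the final algorithm, and the predict_next helper is hypothetical:

# Hypothetical helper: given a two-word prefix, return the most frequent
# third words observed in the trigram table freq3
predict_next <- function(prefix, n = 3) {
  parts <- strsplit(as.character(freq3$word), " ")
  first_two <- vapply(parts, function(p) paste(p[1:2], collapse = " "), character(1))
  third <- vapply(parts, function(p) p[3], character(1))
  hits <- first_two == tolower(prefix)
  head(third[hits], n)  # freq3 is already sorted by frequency, so the first hits are the most common
}
predict_next("new york")

A real model would add smoothing and a back-off to the bigram and unigram tables for prefixes that were never observed.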