Exploratory analysis

The goal of this report is to describe the major features of the data and to outline a plan for creating a prediction algorithm.

Objectives

Download the data and successfully load it in.

Create a basic report of summary statistics about the data sets.

Report any interesting findings that you have amassed so far.

Get feedback on your plans for creating a prediction algorithm and Shiny app.
knitr::opts_chunk$set(echo = TRUE)
# Load required libraries
library(stringi)
library(tm)
library(RWeka)
library(ggplot2)

# Read data

blogs <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)

news <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.news.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)

twitter <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)
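
If readLines warns about an incomplete final line or embedded nulls (common with the Twitter file), opening the connection in binary mode and suppressing warnings is one workaround; a minimal sketch, where readFile is a hypothetical helper:

# Sketch: read through a binary-mode connection to avoid
# "incomplete final line" warnings; readFile is a hypothetical helper
readFile <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}
# twitter <- readFile("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt")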

Basic summary: line counts

length(blogs)
length(news)
length(twitter)

Word counts

blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)
c(sum(blogs.words), sum(news.words), sum(twitter.words))

Descriptive statistics of each document

summary(nchar(blogs))
summary(nchar(news))
summary(nchar(twitter))
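
For easier comparison, the statistics above can be gathered into a single table; a minimal sketch using the objects already computed:

# Combine line, word, and average word counts into one summary table
data.frame(
  source = c("blogs", "news", "twitter"),
  lines  = c(length(blogs), length(news), length(twitter)),
  words  = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
  words.per.line = c(mean(blogs.words), mean(news.words), mean(twitter.words))
)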

Analysis

# Sample 5% of each source, then draw 10,000 lines for the analysis

set.seed(10000)
sampleblogs <- sample(blogs, size = round(length(blogs) * 0.05))
samplenews <- sample(news, size = round(length(news) * 0.05))
sampletwitter <- sample(twitter, size = round(length(twitter) * 0.05))
data.sample <- sample(c(sampleblogs, samplenews, sampletwitter), size = 10000)

# Generate corpus and clean the data
corpus <- Corpus(VectorSource(data.sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
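
A quick spot check helps confirm the cleaning behaved as intended; a minimal sketch:

# Spot-check one cleaned document
as.character(corpus[[1]])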

Tokenize Data and Plot N-gram Frequency

# Tokenize data
uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uniGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = uniGramTokenizer))
biGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = biGramTokenizer))
triGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = triGramTokenizer))
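
These matrices can be extremely sparse, so converting them with as.matrix may use a lot of memory; tm's removeSparseTerms can optionally shrink them first (the 0.999 threshold below is an assumption to tune, not a calibrated value):

# Optionally drop very sparse terms before any dense conversion;
# 0.999 is an assumed threshold, not a tuned value
biGramMatrix  <- removeSparseTerms(biGramMatrix, 0.999)
triGramMatrix <- removeSparseTerms(triGramMatrix, 0.999)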

# Plot unigram frequency
freqTerm <- findFreqTerms(uniGramMatrix, lowfreq = 5000)
termFreq <- rowSums(as.matrix(uniGramMatrix[freqTerm,]))
termFreq <- data.frame(unigram = names(termFreq), frequency = termFreq)

p <- ggplot(termFreq, aes(x = reorder(unigram, frequency), y = frequency)) + 
  geom_bar(stat = "identity") + xlab("unigram") + ylab("frequency") + 
  labs(title = "UniGram Frequency")
print(p)
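
The same plotting approach extends to bigrams; the lowfreq cutoff of 500 below is an assumption (bigrams repeat far less often than single words), and coord_flip keeps the longer labels readable:

# Plot bigram frequency, mirroring the unigram plot above
freqTerm2 <- findFreqTerms(biGramMatrix, lowfreq = 500)
termFreq2 <- rowSums(as.matrix(biGramMatrix[freqTerm2, ]))
termFreq2 <- data.frame(bigram = names(termFreq2), frequency = termFreq2)

p2 <- ggplot(termFreq2, aes(x = reorder(bigram, frequency), y = frequency)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("bigram") + ylab("frequency") +
  labs(title = "BiGram Frequency")
print(p2)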

Summary

The next steps are to finalize a predictive model based on the n-gram analysis above and to build a Shiny app around that model.

The Shiny app will let the user input text and will return a few predicted words.
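
As a preview of how prediction could work, the simplest baseline is to look up the most frequent bigrams that begin with the user's last word; a hypothetical sketch (predictNext is an illustrative helper, not the final model):

# Hypothetical sketch: predict the next word from bigram frequencies
biFreq <- sort(rowSums(as.matrix(biGramMatrix)), decreasing = TRUE)
predictNext <- function(word, n = 3) {
  hits <- grep(paste0("^", word, " "), names(biFreq), value = TRUE)
  head(sub(paste0("^", word, " "), "", hits), n)
}
# predictNext("the")   # the three words most often seen after "the"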