Exploratory analysis
The goal of this report is to describe the major features of the data and to outline the plan for creating a prediction algorithm.
Objective
Demonstrate that the data has been downloaded and successfully loaded.
Create a basic report of summary statistics about the data sets.
Report any interesting findings that you have amassed so far.
Get feedback on your plans for creating a prediction algorithm and Shiny app
# Global chunk option: show the R source of each chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
# Load required Library
library(stringi)
library(tm)
library(RWeka)
library(ggplot2)
# Read Data

# Read one corpus file and return its lines as a character vector marked as
# UTF-8.
#
# The connection is opened explicitly in binary mode ("rb"): readLines()
# accepts LF, CRLF and CR line endings in any mode, while a text-mode read can
# stop early at control characters embedded in the raw text (notably on
# Windows). on.exit() closes the connection even when readLines() fails, so no
# connection is leaked.
#
# path: path to the text file to read.
# Returns: character vector with one element per line; embedded NULs are
#          skipped.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# NOTE(review): the blogs file is read from a different base directory
# ("./Desktop/...") than the news and twitter files
# ("./Coursera-SwiftKey/..."); confirm that this is intentional.
blogs <- read_corpus_lines("./Desktop/final/en_US/en_US.blogs.txt")
news <- read_corpus_lines("./Coursera-SwiftKey/final/en_US/en_US.news.txt")
twitter <- read_corpus_lines("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
Basic Summary Line Count
# Number of lines read from each source (one element per line of the file).
length(blogs)
length(news)
# NOTE(review): the next line looks like console output pasted into the source
# (R treats it as a comment); confirm whether it belongs here.
## [1] 1
length(twitter)
Word counts
# Per-line word counts for each source (stringi word-boundary counting).
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)
# Total number of words per source, in the order blogs, news, twitter.
vapply(
  list(blogs.words, news.words, twitter.words),
  sum,
  integer(1)
)
Descriptive statistics of the line lengths (in characters) of each document
# Distribution (min, quartiles, mean, max) of the number of characters per
# line, reported separately for each source.
summary(nchar(blogs))
summary(nchar(news))
summary(nchar(twitter))
Analysis
# Data will be sampled
# Fixed seed so the samples (and everything derived from them) are
# reproducible.
set.seed(10000)
# Draw 5% of the lines of each source; floor() makes the truncation of the
# fractional size explicit.
sampleblogs <- sample(blogs, size = floor(length(blogs) * 0.05))
samplenews <- sample(news, size = floor(length(news) * 0.05))
sampletwitter <- sample(twitter, size = floor(length(twitter) * 0.05))
# Pool the three samples with c() so that every sampled line remains its own
# document. paste() is vectorised: it would glue a blog line, a news line and
# a tweet together element-wise (recycling the shorter vectors), producing
# n-grams that span unrelated texts.
sample.pool <- c(sampleblogs, samplenews, sampletwitter)
# Working sample of 10000 documents. replace = TRUE keeps the size at 10000
# even when the pool is smaller, at the cost of possible duplicate lines.
data.sample <- sample(sample.pool, size = 10000, replace = TRUE)
# Generate corpus and clean the data
# A VCorpus is built explicitly: Corpus() on a VectorSource yields a
# SimpleCorpus, for which TermDocumentMatrix() ignores custom tokenizer
# functions such as the RWeka n-gram tokenizers.
corpus <- VCorpus(VectorSource(data.sample))
# Content transformer that replaces every match of `pattern` with one space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
# Replace every character outside the POSIX [:graph:] class with a space.
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
# Base functions must be wrapped in content_transformer() so that tm_map()
# keeps each element a text document instead of a bare character vector; this
# also makes a final PlainTextDocument re-wrapping step unnecessary.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
Tokenize Data and Plot N-gram Frequency
# knitr is never attached with library(), so the engine registry must be
# namespace-qualified (as the chunk-option setup already is). Called without
# arguments, set() registers no engines.
knitr::knit_engines$set()
# Tokenize Data
# Factory for fixed-size n-gram tokenizers: the returned function passes its
# input to RWeka's NGramTokenizer with both the minimum and the maximum
# n-gram size set to `n`.
makeNGramTokenizer <- function(n) {
  force(n)
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}
uniGramTokenizer <- makeNGramTokenizer(1)
biGramTokenizer <- makeNGramTokenizer(2)
triGramTokenizer <- makeNGramTokenizer(3)

# Build the term-document matrix of `corpus` for one tokenizer.
buildNGramMatrix <- function(tokenizer) {
  TermDocumentMatrix(corpus, control = list(tokenize = tokenizer))
}
uniGramMatrix <- buildNGramMatrix(uniGramTokenizer)
biGramMatrix <- buildNGramMatrix(biGramTokenizer)
triGramMatrix <- buildNGramMatrix(triGramTokenizer)
# Plot Unigram Frequency
# Terms whose total frequency in the sample is at least 5000.
freqTerm <- findFreqTerms(uniGramMatrix, lowfreq = 5000)
# Subset the sparse matrix to those terms before densifying it, then total
# each term's occurrences across all documents.
unigramTotals <- rowSums(as.matrix(uniGramMatrix[freqTerm, ]))
termFreq <- data.frame(
  unigram = names(unigramTotals),
  frequency = unigramTotals
)
# Bar chart of the frequent unigrams, ordered by ascending frequency.
p <- ggplot(termFreq, aes(x = reorder(unigram, frequency), y = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "UniGram Frequency", x = "unigram", y = "frequency")
print(p)
Summary
The next step in the process is to finalize a predictive model based on the n-gram analysis above and to build the Shiny app on top of that model.
The Shiny app will let the user enter text and will return a few predicted next words.