This milestone report for the Coursera Data Science Capstone project describes the major features of the training data, presents our exploratory data analysis, and summarizes our plans for building the predictive model.
library(downloader)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
setwd("/Users/akhilaprasad/Downloads/final/en_US")
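The three input files are assumed to already be in this directory. If they are not, they could be fetched first with the downloader package loaded above; the sketch below assumes the course's standard Coursera-SwiftKey download URL and archive layout.
# Sketch: fetch and unpack the corpus if the files are not already on disk.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("en_US.twitter.txt")) {
  download(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip", junkpaths = TRUE,
        files = c("final/en_US/en_US.twitter.txt",
                  "final/en_US/en_US.blogs.txt",
                  "final/en_US/en_US.news.txt"))
}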
twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8")
blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8")
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8")
library(stringi)
length(twitter)
## [1] 2360148
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
twitterwords <-stri_stats_latex(twitter)[4]  # element 4 of stri_stats_latex() is "Words"
blogswords <-stri_stats_latex(blogs)[4]
newswords <-stri_stats_latex(news)[4]
nchar_twitter<-sum(nchar(twitter))
nchar_blogs<-sum(nchar(blogs))
nchar_news<-sum(nchar(news))
data.frame("File Name" = c("twitter", "blogs", "news"),
           "num.lines" = c(length(twitter), length(blogs), length(news)),
           "num.words" = c(twitterwords, blogswords, newswords),
           "num.chars" = c(nchar_twitter, nchar_blogs, nchar_news))
## File.Name num.lines num.words num.chars
## 1 twitter 2360148 30451128 162096031
## 2 blogs 899288 37570839 206824505
## 3 news 1010242 34494539 203223159
In this part we carry out some exploratory data analysis: data merging, data sampling, and data cleaning. To be able to clean and manipulate the data, we create a corpus consisting of samples from the three text files.
set.seed(10000)
blogs_c<-iconv(blogs,"latin1","ASCII",sub="")
news_c<-iconv(news,"latin1","ASCII",sub="")
twitter_c<-iconv(twitter,"latin1","ASCII",sub="")
library(tm)
## Loading required package: NLP
library(NLP)
sampledata<-c(sample(twitter_c,length(twitter_c)*0.01),
sample(blogs_c,length(blogs_c)*0.01),
sample(news_c,length(news_c)*0.01))
corpus <- VCorpus(VectorSource(sampledata))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # strip URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      # strip @mentions
corpus <- tm_map(corpus, content_transformer(tolower))             # lower-case
corpus <- tm_map(corpus, removeWords, stopwords("en"))             # drop English stop words
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
corpusresult<-data.frame(text=unlist(sapply(corpus,'[',"content")),stringsAsFactors = FALSE)
head(corpusresult)
## text
## 1 tiger woods poker night phil hellmuth doyle brunson
## 2 suddenly feel start checking retirement homes
## 3 acutely aware fact major blowers building sound good microphones
## 4 seem right avatar
## 5 new empathic approach used one high school great results
## 6 many diagnosed altzheimers old head injury ever wonder pres reagan subdural hematoma tbi
We now build n-gram tokenizers and plot the most frequent terms. 1. Unigrams
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
unigram<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
unigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=unigram))
unigramcorpus<-findFreqTerms(unigramtab,lowfreq=1000)
unigramcorpusnum<-rowSums(as.matrix(unigramtab[unigramcorpus,]))
unigramcorpustab<-data.frame(Word=names(unigramcorpusnum),frequency=unigramcorpusnum)
unigramcorpussort<-unigramcorpustab[order(-unigramcorpustab$frequency),]
ggplot(unigramcorpussort[1:15,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Unigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
2. Bigrams
bigram<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
bigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=bigram))
bigramcorpus<-findFreqTerms(bigramtab,lowfreq=80)
bigramcorpusnum<-rowSums(as.matrix(bigramtab[bigramcorpus,]))
bigramcorpustab<-data.frame(Word=names(bigramcorpusnum),frequency=bigramcorpusnum)
bigramcorpussort<-bigramcorpustab[order(-bigramcorpustab$frequency),]
ggplot(bigramcorpussort[1:12,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Bigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
3. Trigrams
trigram<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
trigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=trigram))
trigramcorpus<-findFreqTerms(trigramtab,lowfreq=10)
trigramcorpusnum<-rowSums(as.matrix(trigramtab[trigramcorpus,]))
trigramcorpustab<-data.frame(Word=names(trigramcorpusnum),frequency=trigramcorpusnum)
trigramcorpussort<-trigramcorpustab[order(-trigramcorpustab$frequency),]
ggplot(trigramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Trigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
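The three n-gram blocks above differ only in the n-gram order, the frequency cutoff, and the number of bars plotted, so they could be folded into one helper. This is just a refactoring sketch of the same RWeka/tm/ggplot2 calls already used above; the function name plot_ngrams is illustrative.
# Sketch: one helper covering the unigram/bigram/trigram blocks above.
plot_ngrams <- function(corpus, n, lowfreq, top, title) {
  tok  <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm  <- TermDocumentMatrix(corpus, control = list(tokenize = tok))
  freq <- rowSums(as.matrix(tdm[findFreqTerms(tdm, lowfreq = lowfreq), ]))
  tab  <- data.frame(Word = names(freq), frequency = freq)
  tab  <- tab[order(-tab$frequency), ][1:top, ]
  ggplot(tab, aes(x = reorder(Word, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = I("grey50")) +
    labs(title = title, x = "Most Words", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60))
}
# Equivalent to the blocks above, e.g.: plot_ngrams(corpus, 3, 10, 10, "Trigrams")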
We now present what our predictive Shiny app will consist of and briefly describe an initial algorithm.
The user interface (UI) of the Shiny app will consist of a textInput widget into which the user can type or paste English text. Below it, a submitButton or actionButton widget will send the text to the predictive algorithm, which retrieves the most likely words to follow. To the right of these widgets, a selectInput with no pre-selected choice will offer the top three or top five candidate next words. A double-click, or an Add button, will append the chosen word to the phrase already typed, ready for the next prediction.
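To make this concrete, the sketch below pairs a minimal UI of that shape with a naive backoff lookup over the n-gram frequency tables built earlier. The names predict_next, phrase, go, and nextWord are illustrative assumptions, and raw counts stand in for the smoothed probabilities a real model would use.
library(shiny)
# Sketch: naive backoff over the frequency tables built above
# (trigrams first, then bigrams, then the most frequent unigrams).
predict_next <- function(phrase, n = 5) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  k <- length(words)
  if (k >= 2) {
    prefix <- paste0(words[k-1], " ", words[k], " ")
    hits <- trigramcorpussort[startsWith(as.character(trigramcorpussort$Word), prefix), ]
    if (nrow(hits) > 0) return(head(sub(".* ", "", hits$Word), n))
  }
  if (k >= 1) {
    prefix <- paste0(words[k], " ")
    hits <- bigramcorpussort[startsWith(as.character(bigramcorpussort$Word), prefix), ]
    if (nrow(hits) > 0) return(head(sub(".* ", "", hits$Word), n))
  }
  head(as.character(unigramcorpussort$Word), n)
}
# Minimal UI of the shape described above: text box, action button,
# and a selectInput listing the predicted next words.
ui <- fluidPage(
  textInput("phrase", "Type or paste English text:"),
  actionButton("go", "Predict"),
  selectInput("nextWord", "Most likely next words:", choices = character(0))
)
server <- function(input, output, session) {
  observeEvent(input$go, {
    updateSelectInput(session, "nextWord", choices = predict_next(input$phrase))
  })
}
# shinyApp(ui, server)
Because stop words and punctuation were removed during cleaning, predictions from this small sample corpus will look telegraphic; a production model would likely be trained on a larger sample that keeps them.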