This milestone report for the Coursera Data Science Capstone project describes the major features of the training data, presents our exploratory data analysis, and summarizes our plans for building the predictive model.
library(downloader)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
setwd("/Users/akhilaprasad/Downloads/final/en_US")
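The three input files are assumed to already be in this directory. If they are not, they could be fetched first with the downloader package loaded above; the sketch below assumes the course's standard Coursera-SwiftKey download URL and archive layout.
# Sketch: fetch and unpack the corpus if the files are not already on disk.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("en_US.twitter.txt")) {
  download(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip", junkpaths = TRUE,
        files = c("final/en_US/en_US.twitter.txt",
                  "final/en_US/en_US.blogs.txt",
                  "final/en_US/en_US.news.txt"))
}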
twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8")
blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8")
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8")
library(stringi)
length(twitter)
## [1] 2360148
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
twitterwords <-stri_stats_latex(twitter)[4]  # element 4 of stri_stats_latex() is "Words"
blogswords <-stri_stats_latex(blogs)[4]
newswords <-stri_stats_latex(news)[4]
nchar_twitter<-sum(nchar(twitter))
nchar_blogs<-sum(nchar(blogs))
nchar_news<-sum(nchar(news))
data.frame("File Name" = c("twitter", "blogs", "news"),
           "num.lines" = c(length(twitter), length(blogs), length(news)),
           "num.words" = c(twitterwords, blogswords, newswords),
           "num.chars" = c(nchar_twitter, nchar_blogs, nchar_news))
## File.Name num.lines num.words num.chars
## 1 twitter 2360148 30451128 162096031
## 2 blogs 899288 37570839 206824505
## 3 news 1010242 34494539 203223159
In this part we carry out some exploratory data analysis: data merging, data sampling, and data cleaning. To be able to clean and manipulate the data, we create a corpus consisting of samples from the three text files.
set.seed(10000)
blogs_c<-iconv(blogs,"latin1","ASCII",sub="")
news_c<-iconv(news,"latin1","ASCII",sub="")
twitter_c<-iconv(twitter,"latin1","ASCII",sub="")
library(tm)
## Loading required package: NLP
library(NLP)
sampledata<-c(sample(twitter_c,length(twitter_c)*0.01),
sample(blogs_c,length(blogs_c)*0.01),
sample(news_c,length(news_c)*0.01))
corpus <- VCorpus(VectorSource(sampledata))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # strip URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      # strip @mentions
corpus <- tm_map(corpus, content_transformer(tolower))             # lower-case
corpus <- tm_map(corpus, removeWords, stopwords("en"))             # drop English stop words
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
corpusresult<-data.frame(text=unlist(sapply(corpus,'[',"content")),stringsAsFactors = FALSE)
head(corpusresult)
## text
## 1 tiger woods poker night phil hellmuth doyle brunson
## 2 suddenly feel start checking retirement homes
## 3 acutely aware fact major blowers building sound good microphones
## 4 seem right avatar
## 5 new empathic approach used one high school great results
## 6 many diagnosed altzheimers old head injury ever wonder pres reagan subdural hematoma tbi
We now build n-gram tokenizers and plot the most frequent terms. 1. Unigrams
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
unigram<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
unigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=unigram))
unigramcorpus<-findFreqTerms(unigramtab,lowfreq=1000)
unigramcorpusnum<-rowSums(as.matrix(unigramtab[unigramcorpus,]))
unigramcorpustab<-data.frame(Word=names(unigramcorpusnum),frequency=unigramcorpusnum)
unigramcorpussort<-unigramcorpustab[order(-unigramcorpustab$frequency),]
ggplot(unigramcorpussort[1:15,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Unigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
2. Bigrams
bigram<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
bigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=bigram))
bigramcorpus<-findFreqTerms(bigramtab,lowfreq=80)
bigramcorpusnum<-rowSums(as.matrix(bigramtab[bigramcorpus,]))
bigramcorpustab<-data.frame(Word=names(bigramcorpusnum),frequency=bigramcorpusnum)
bigramcorpussort<-bigramcorpustab[order(-bigramcorpustab$frequency),]
ggplot(bigramcorpussort[1:12,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Bigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
3. Trigrams
trigram<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
trigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=trigram))
trigramcorpus<-findFreqTerms(trigramtab,lowfreq=10)
trigramcorpusnum<-rowSums(as.matrix(trigramtab[trigramcorpus,]))
trigramcorpustab<-data.frame(Word=names(trigramcorpusnum),frequency=trigramcorpusnum)
trigramcorpussort<-trigramcorpustab[order(-trigramcorpustab$frequency),]
ggplot(trigramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Trigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
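The three n-gram blocks above differ only in the n-gram order, the frequency cutoff, and the number of bars plotted, so they could be folded into one helper. This is just a refactoring sketch of the same RWeka/tm/ggplot2 calls already used above; the function name plot_ngrams is illustrative.
# Sketch: one helper covering the unigram/bigram/trigram blocks above.
plot_ngrams <- function(corpus, n, lowfreq, top, title) {
  tok  <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm  <- TermDocumentMatrix(corpus, control = list(tokenize = tok))
  freq <- rowSums(as.matrix(tdm[findFreqTerms(tdm, lowfreq = lowfreq), ]))
  tab  <- data.frame(Word = names(freq), frequency = freq)
  tab  <- tab[order(-tab$frequency), ][1:top, ]
  ggplot(tab, aes(x = reorder(Word, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = I("grey50")) +
    labs(title = title, x = "Most Words", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60))
}
# Equivalent to the blocks above, e.g.: plot_ngrams(corpus, 3, 10, 10, "Trigrams")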
We now present what our predictive Shiny app will consist of and briefly describe an initial algorithm.
The user interface (UI) of the Shiny app will consist of a textInput widget into which the user can type or paste English text. Below it, a submitButton or actionButton widget will send the text to the predictive algorithm, which retrieves the most likely words to follow. To the right of these widgets, a selectInput with no pre-selected choice will offer the top three or top five candidate next words. A double-click, or an Add button, will append the chosen word to the phrase already typed, ready for the next prediction.
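To make this concrete, the sketch below pairs a minimal UI of that shape with a naive backoff lookup over the n-gram frequency tables built earlier. The names predict_next, phrase, go, and nextWord are illustrative assumptions, and raw counts stand in for the smoothed probabilities a real model would use.
library(shiny)
# Sketch: naive backoff over the frequency tables built above
# (trigrams first, then bigrams, then the most frequent unigrams).
predict_next <- function(phrase, n = 5) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  k <- length(words)
  if (k >= 2) {
    prefix <- paste0(words[k-1], " ", words[k], " ")
    hits <- trigramcorpussort[startsWith(as.character(trigramcorpussort$Word), prefix), ]
    if (nrow(hits) > 0) return(head(sub(".* ", "", hits$Word), n))
  }
  if (k >= 1) {
    prefix <- paste0(words[k], " ")
    hits <- bigramcorpussort[startsWith(as.character(bigramcorpussort$Word), prefix), ]
    if (nrow(hits) > 0) return(head(sub(".* ", "", hits$Word), n))
  }
  head(as.character(unigramcorpussort$Word), n)
}
# Minimal UI of the shape described above: text box, action button,
# and a selectInput listing the predicted next words.
ui <- fluidPage(
  textInput("phrase", "Type or paste English text:"),
  actionButton("go", "Predict"),
  selectInput("nextWord", "Most likely next words:", choices = character(0))
)
server <- function(input, output, session) {
  observeEvent(input$go, {
    updateSelectInput(session, "nextWord", choices = predict_next(input$phrase))
  })
}
# shinyApp(ui, server)
Because stop words and punctuation were removed during cleaning, predictions from this small sample corpus will look telegraphic; a production model would likely be trained on a larger sample that keeps them.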