This milestone report presents an exploratory data analysis of the SwiftKey data provided for the Coursera Data Science Capstone. The data consist of three text files containing text from three different sources: blogs, news, and Twitter.
We first load the necessary libraries and read the data.
options(warn = -1)
library(stringi)
library(tm)
library(NLP)
## Loading required package: NLP
library(rJava)
library(RWeka)
library(RWekajars)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("coursera-swiftkey.zip")){
download.file(url, destfile="coursera-swiftkey.zip")
}
setwd("/Users/Omid/R_online/Capstone/final/en_US/")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
# word counts: element 4 of stri_stats_latex() is the number of words
twitterwords <- stri_stats_latex(twitter)[4]
blogswords <- stri_stats_latex(blogs)[4]
newswords <- stri_stats_latex(news)[4]
# character counts per file
nchar_twitter <- sum(nchar(twitter))
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
data.frame("File Name" = c("twitter", "blogs", "news"),
"num.lines" = c(length(twitter),length(blogs), length(news)),
"num.words" = c(sum(blogswords), sum(newswords), sum(twitterwords)),
"Num of character"=c(nchar_blogs,nchar_news,nchar_twitter))
## File.Name num.lines num.words Num.of.character
## 1 twitter 2360148 37570839 206824505
## 2 blogs 899288 34494539 203223159
## 3 news 1010242 30451128 162096031
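For reference, the on-disk sizes that motivate sampling below can be checked directly; a minimal sketch, assuming the three files sit in the working directory (sizes in MB):
round(file.size(c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")) / 1024^2, 1)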
Because the full files are large, we now sample the data, merge the samples, and clean the resulting corpus for the exploratory analysis.
set.seed(12345)
# strip non-ASCII characters so tokenization does not choke on them
blogs_c <- iconv(blogs, "latin1", "ASCII", sub = "")
news_c <- iconv(news, "latin1", "ASCII", sub = "")
twitter_c <- iconv(twitter, "latin1", "ASCII", sub = "")
# merge a 1% random sample of each source into a single data set
sampledata <- c(sample(twitter_c, length(twitter_c) * 0.01),
                sample(blogs_c, length(blogs_c) * 0.01),
                sample(news_c, length(news_c) * 0.01))
corpus <- VCorpus(VectorSource(sampledata))
# replace URLs and Twitter handles with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
# wrap tolower in content_transformer() so the documents stay PlainTextDocuments
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
# flatten the cleaned corpus into a data frame for inspection
corpusresult <- data.frame(text = unlist(sapply(corpus, '[', "content")), stringsAsFactors = FALSE)
head(corpusresult, 5)
## text
## 1 gay guys walk one direction rt get fan d found funny
## 2 getting hungry sweet savory muffins cooked smbmad ur support going straight everyones tummy
## 3 another food truck friday cathedral square brewernation show us beastmode get order
## 4 corned beef definitely today
## 5 relatively painless hope
# tokenize into unigrams with RWeka and build a term-document matrix
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
# keep terms appearing at least 1000 times and sort by frequency
unigramcorpus <- findFreqTerms(unigramtab, lowfreq = 1000)
unigramcorpusnum <- rowSums(as.matrix(unigramtab[unigramcorpus, ]))
unigramcorpustab <- data.frame(Word = names(unigramcorpusnum), frequency = unigramcorpusnum)
unigramcorpussort <- unigramcorpustab[order(-unigramcorpustab$frequency), ]
The following bar chart shows the ten most frequent unigrams (single words).
ggplot(unigramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Unigrams",x="Most Frequent Single Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
# repeat for bigrams; a lower frequency cut-off (80) suits the rarer pairs
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
bigramcorpus <- findFreqTerms(bigramtab, lowfreq = 80)
bigramcorpusnum <- rowSums(as.matrix(bigramtab[bigramcorpus, ]))
bigramcorpustab <- data.frame(Word = names(bigramcorpusnum), frequency = bigramcorpusnum)
bigramcorpussort <- bigramcorpustab[order(-bigramcorpustab$frequency), ]
The following bar chart shows the ten most frequent bigrams (word pairs).
ggplot(bigramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Bigrams",x="Most Frequent Pair Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
# repeat for trigrams, with a still lower cut-off (10) for the sparser triples
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
trigramcorpus <- findFreqTerms(trigramtab, lowfreq = 10)
trigramcorpusnum <- rowSums(as.matrix(trigramtab[trigramcorpus, ]))
trigramcorpustab <- data.frame(Word = names(trigramcorpusnum), frequency = trigramcorpusnum)
trigramcorpussort <- trigramcorpustab[order(-trigramcorpustab$frequency), ]
The following bar chart shows the ten most frequent trigrams (word triples).
ggplot(trigramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Trigrams",x="Most Frequent Triple Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
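Although model building belongs to the next milestone, the trigram table already hints at how prediction could work: split each trigram into a two-word prefix and its observed continuation. The following is a minimal sketch under that assumption; get_continuations() is a hypothetical helper, not part of the analysis above.
# split each trigram "w1 w2 w3" into a two-word prefix and the observed third word
parts <- strsplit(as.character(trigramcorpussort$Word), " ")
lookup <- data.frame(prefix = sapply(parts, function(w) paste(w[1], w[2])),
                     nextword = sapply(parts, function(w) w[3]),
                     frequency = trigramcorpussort$frequency,
                     stringsAsFactors = FALSE)
# continuations for a given prefix, ordered by frequency (lookup inherits the sort)
get_continuations <- function(prefix) lookup[lookup$prefix == prefix, c("nextword", "frequency")]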
The next step of this capstone project is to develop a predictive text model and present it in a Shiny app. The app consists of a user interface and a server: the user enters a phrase into a single text box and the app suggests the next word.
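A minimal sketch of the planned app structure, assuming a hypothetical predict_next_word() placeholder standing in for the n-gram model still to be built:
library(shiny)
# hypothetical placeholder for the n-gram model to be developed
predict_next_word <- function(phrase) {
    "the"  # dummy result until the real model exists
}
ui <- fluidPage(
    titlePanel("Next Word Prediction"),
    textInput("phrase", "Enter a phrase:"),
    textOutput("prediction")
)
server <- function(input, output) {
    output$prediction <- renderText({
        predict_next_word(input$phrase)
    })
}
shinyApp(ui = ui, server = server)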