This milestone report presents an exploratory data analysis of the SwiftKey data provided for the Coursera Data Science Capstone. The data consist of three text files containing text from three different sources: blogs, news, and Twitter.
We first load the necessary libraries and read the data.
options(warn = -1)
library(stringi)
library(tm)
library(NLP)
## Loading required package: NLP
library(rJava)
library(RWeka)
library(RWekajars)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("coursera-swiftkey.zip")){
download.file(url, destfile="coursera-swiftkey.zip")
}
setwd("/Users/Omid/R_online/Capstone/final/en_US/")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
# word counts: element 4 of stri_stats_latex() is the number of words
twitterwords <- stri_stats_latex(twitter)[4]
blogswords <- stri_stats_latex(blogs)[4]
newswords <- stri_stats_latex(news)[4]
# character counts per file
nchar_twitter <- sum(nchar(twitter))
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
data.frame("File Name" = c("twitter", "blogs", "news"),
"num.lines" = c(length(twitter),length(blogs), length(news)),
"num.words" = c(sum(blogswords), sum(newswords), sum(twitterwords)),
"Num of character"=c(nchar_blogs,nchar_news,nchar_twitter))
## File.Name num.lines num.words Num.of.character
## 1 twitter 2360148 37570839 206824505
## 2 blogs 899288 34494539 203223159
## 3 news 1010242 30451128 162096031
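For reference, the on-disk sizes that motivate sampling below can be checked directly; a minimal sketch, assuming the three files sit in the working directory (sizes in MB):
round(file.size(c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")) / 1024^2, 1)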
Because the full files are large, we now sample the data, merge the samples, and clean the resulting corpus for the exploratory analysis.
set.seed(12345)
# strip non-ASCII characters so tokenization does not choke on them
blogs_c <- iconv(blogs, "latin1", "ASCII", sub = "")
news_c <- iconv(news, "latin1", "ASCII", sub = "")
twitter_c <- iconv(twitter, "latin1", "ASCII", sub = "")
# merge a 1% random sample of each source into a single data set
sampledata <- c(sample(twitter_c, length(twitter_c) * 0.01),
                sample(blogs_c, length(blogs_c) * 0.01),
                sample(news_c, length(news_c) * 0.01))
corpus <- VCorpus(VectorSource(sampledata))
# replace URLs and Twitter handles with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
# wrap tolower in content_transformer() so the documents stay PlainTextDocuments
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
# flatten the cleaned corpus into a data frame for inspection
corpusresult <- data.frame(text = unlist(sapply(corpus, '[', "content")), stringsAsFactors = FALSE)
head(corpusresult, 5)
## text
## 1 gay guys walk one direction rt get fan d found funny
## 2 getting hungry sweet savory muffins cooked smbmad ur support going straight everyones tummy
## 3 another food truck friday cathedral square brewernation show us beastmode get order
## 4 corned beef definitely today
## 5 relatively painless hope
# tokenize into unigrams with RWeka and build a term-document matrix
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
# keep terms appearing at least 1000 times and sort by frequency
unigramcorpus <- findFreqTerms(unigramtab, lowfreq = 1000)
unigramcorpusnum <- rowSums(as.matrix(unigramtab[unigramcorpus, ]))
unigramcorpustab <- data.frame(Word = names(unigramcorpusnum), frequency = unigramcorpusnum)
unigramcorpussort <- unigramcorpustab[order(-unigramcorpustab$frequency), ]
The following bar chart shows the ten most frequent unigrams (single words).
ggplot(unigramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Unigrams",x="Most Frequent Single Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
# repeat for bigrams; a lower frequency cut-off (80) suits the rarer pairs
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
bigramcorpus <- findFreqTerms(bigramtab, lowfreq = 80)
bigramcorpusnum <- rowSums(as.matrix(bigramtab[bigramcorpus, ]))
bigramcorpustab <- data.frame(Word = names(bigramcorpusnum), frequency = bigramcorpusnum)
bigramcorpussort <- bigramcorpustab[order(-bigramcorpustab$frequency), ]
The following bar chart shows the ten most frequent bigrams (word pairs).
ggplot(bigramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Bigrams",x="Most Frequent Pair Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
# repeat for trigrams, with a still lower cut-off (10) for the sparser triples
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
trigramcorpus <- findFreqTerms(trigramtab, lowfreq = 10)
trigramcorpusnum <- rowSums(as.matrix(trigramtab[trigramcorpus, ]))
trigramcorpustab <- data.frame(Word = names(trigramcorpusnum), frequency = trigramcorpusnum)
trigramcorpussort <- trigramcorpustab[order(-trigramcorpustab$frequency), ]
The following bar chart shows the ten most frequent trigrams (word triples).
ggplot(trigramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("grey50"))+
labs(title="Trigrams",x="Most Frequent Triple Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
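Although model building belongs to the next milestone, the trigram table already hints at how prediction could work: split each trigram into a two-word prefix and its observed continuation. The following is a minimal sketch under that assumption; get_continuations() is a hypothetical helper, not part of the analysis above.
# split each trigram "w1 w2 w3" into a two-word prefix and the observed third word
parts <- strsplit(as.character(trigramcorpussort$Word), " ")
lookup <- data.frame(prefix = sapply(parts, function(w) paste(w[1], w[2])),
                     nextword = sapply(parts, function(w) w[3]),
                     frequency = trigramcorpussort$frequency,
                     stringsAsFactors = FALSE)
# continuations for a given prefix, ordered by frequency (lookup inherits the sort)
get_continuations <- function(prefix) lookup[lookup$prefix == prefix, c("nextword", "frequency")]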
The next step of this capstone project is to develop a predictive text model and present it in a Shiny app. The app consists of a user interface and a server: the user enters a phrase into a single text box and the app suggests the next word.
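A minimal sketch of the planned app structure, assuming a hypothetical predict_next_word() placeholder standing in for the n-gram model still to be built:
library(shiny)
# hypothetical placeholder for the n-gram model to be developed
predict_next_word <- function(phrase) {
    "the"  # dummy result until the real model exists
}
ui <- fluidPage(
    titlePanel("Next Word Prediction"),
    textInput("phrase", "Enter a phrase:"),
    textOutput("prediction")
)
server <- function(input, output) {
    output$prediction <- renderText({
        predict_next_word(input$phrase)
    })
}
shinyApp(ui = ui, server = server)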