Introduction

The goal of this project is to demonstrate the knowledge I have gained from working with the data and to show how I will create my prediction algorithm.

The motivation for this project is to:

1. Demonstrate that I have downloaded the data and successfully loaded it in.

2. Create a basic report of summary statistics about the data sets.

3. Report any interesting findings gathered so far.

4. Get feedback on my plans for creating a prediction algorithm and Shiny app.

Loading Data

setwd("~/R/Capstone/final/en_US")
con <- file("en_US.twitter.txt", "r")
twitter<-readLines(con)
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
close(con)

con <- file("en_US.blogs.txt","r")
blogs<-readLines(con)
close(con)

con <- file("en_US.news.txt","r")
news<-readLines(con)
## Warning in readLines(con): incomplete final line found on 'en_US.news.txt'
close(con)
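
The warnings above indicate embedded nul characters in the Twitter file and an incomplete final line in the news file; the news file in particular appears to be cut short when read this way (its line count in the summary table below is far lower than the other two files). A possible refinement, not used for the results in this report, is to open the files in binary mode and pass skipNul = TRUE to readLines. The helper read_corpus_file below is purely illustrative.

# Sketch: read a file in binary mode, skipping embedded nuls
read_corpus_file <- function(path) {
    con <- file(path, "rb")
    lines <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
    close(con)
    lines
}
# e.g. news <- read_corpus_file("en_US.news.txt")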

Exploratory Analyses

Count the size of the data

library(stringi)
## Warning: package 'stringi' was built under R version 3.4.1
twitter_words<-stri_count_words(twitter)
blogs_words<-stri_count_words(blogs)
news_words<-stri_count_words(news)

#line counts for each file
nline_twitter<-length(twitter_words)
nline_blogs<-length(blogs_words)
nline_news<-length(news_words)
line<-c(nline_twitter,nline_blogs,nline_news)

#Word counts for each file
twitter_words<-sum(twitter_words)
blogs_words<-sum(blogs_words)
news_words<-sum(news_words)
words<-c(twitter_words,blogs_words,news_words)

#Average word counts per line for each file
ave_twitter_words<-twitter_words/nline_twitter
ave_blogs_words<-blogs_words/nline_blogs
ave_news_words<-news_words/nline_news
ave_words<-c(ave_twitter_words,ave_blogs_words,ave_news_words)

#Summary Table
table<-cbind(words,line,ave_words)
rownames(table) <- c("twitter", "blogs", "news")
table
##            words    line ave_words
## twitter 30279349 2360148  12.82943
## blogs   38872243  899288  43.22558
## news     2710587   77259  35.08442
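
The physical size of each file can also be reported. A minimal sketch, assuming the three files sit in the working directory set above:

# Sketch: file sizes on disk in megabytes
files <- c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt")
size_MB <- round(file.size(files) / 1024^2, 1)
names(size_MB) <- c("twitter", "blogs", "news")
size_MB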

Subset and Clean the data

library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.4.1
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
#We only use a 1% random sample of each file to keep processing manageable
subTwitter <- sample(twitter, length(twitter)*0.01)
subBlogs <- sample(blogs, length(blogs)*0.01)
subNews <- sample(news, length(news)*0.01)

#The subsetted data are combined to build the corpus
combined_data <- c(subBlogs, subNews, subTwitter)
combined_data <- iconv(combined_data, "UTF-8", "ASCII", sub = "")
corpus <- VCorpus(VectorSource(combined_data))

toSpace <- content_transformer(function(x, pattern) 
                                {
                                    return (gsub(pattern, " ", x))
                                })


corpus <- tm_map(corpus, toSpace, "-")
corpus <- tm_map(corpus, toSpace, ":")
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, toSpace, "'")
corpus <- tm_map(corpus, toSpace, " -")
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
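
Additional cleaning could further improve the n-grams, for example lowercasing the text and filtering profanity. The sketch below assumes a plain-text word list, profanity.txt (one word per line), which is not part of the course data set:

# Sketch: lowercase the corpus and remove words from a (hypothetical) profanity list
corpus <- tm_map(corpus, content_transformer(tolower))
profanity <- readLines("profanity.txt", warn = FALSE)   # user-supplied list, one word per line
corpus <- tm_map(corpus, removeWords, profanity)
corpus <- tm_map(corpus, stripWhitespace)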

N-gram Tokenization and Frequency Analyses

library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.1
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
#Extract the cleaned text from the corpus before tokenizing with RWeka
corpus_text <- sapply(corpus, as.character)
unigram <- NGramTokenizer(corpus_text, Weka_control(min = 1, max = 1))
bigram <- NGramTokenizer(corpus_text, Weka_control(min = 2, max = 2))
trigram <- NGramTokenizer(corpus_text, Weka_control(min = 3, max = 3))

#Unigram frequency distribution
unigram.df <- data.frame(table(unigram))
unigram.df <- unigram.df[order(unigram.df$Freq, decreasing = TRUE),]

plot1<-ggplot(unigram.df[1:15,], aes(x=reorder(unigram, -Freq), y=Freq)) +
    geom_bar(stat = "identity")+ 
    xlab("Unigram") + ylab("Frequency") +
    theme(axis.text.x=element_text(angle=90, hjust=1))+
    labs(title = "Most common words")
plot1

bigram.df <- data.frame(table(bigram))
bigram.df <- bigram.df[order(bigram.df$Freq, decreasing = TRUE),]

plot2<-ggplot(bigram.df[1:15,], aes(x=reorder(bigram, -Freq), y=Freq)) +
    geom_bar(stat = "identity")+ 
    xlab("Bigram") + ylab("Frequency") +
    theme(axis.text.x=element_text(angle=90, hjust=1))+
    labs(title = "Most common word pairs")
plot2

trigram.df <- data.frame(table(trigram))
trigram.df <- trigram.df[order(trigram.df$Freq, decreasing = TRUE),]

plot3<-ggplot(trigram.df[1:15,], aes(x=reorder(trigram, -Freq), y=Freq)) +
    geom_bar(stat = "identity")+ 
    xlab("Trigram") + ylab("Frequency") +
    theme(axis.text.x=element_text(angle=90, hjust=1))+
    labs(title = "Most common word triplets")

plot3

Future Work

Based on the results of the exploratory and n-gram analyses, the prediction algorithm and Shiny app should include the following improvements, features, and functions:

1. A prediction model based on the n-gram frequencies (a minimal sketch follows this list)

2. Building the n-gram model on the complete data set

3. Further cleaning the data to improve prediction accuracy

4. Automatically updating the data set to personalize the prediction model
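
As a starting point for the prediction model in item 1, the n-gram frequency tables built above can drive a simple backoff predictor: look for trigrams that start with the last two words of the input, back off to bigrams that start with the last word, and finally fall back to the most frequent unigram. The sketch below is only an outline; the function name predict_next_word is my own, and it assumes the unigram.df, bigram.df and trigram.df data frames created earlier and that the corpus text has been lowercased.

# Sketch: simple backoff prediction from the n-gram frequency tables
predict_next_word <- function(phrase,
                              uni = unigram.df, bi = bigram.df, tri = trigram.df) {
    words <- unlist(strsplit(tolower(phrase), "\\s+"))
    n <- length(words)
    last_word_of <- function(ngram) tail(strsplit(as.character(ngram), " ")[[1]], 1)

    # 1. Trigrams whose first two words match the last two input words
    if (n >= 2) {
        hits <- tri[grepl(paste0("^", words[n - 1], " ", words[n], " "), tri$trigram), ]
        if (nrow(hits) > 0)
            return(last_word_of(hits$trigram[which.max(hits$Freq)]))
    }

    # 2. Back off to bigrams whose first word matches the last input word
    if (n >= 1) {
        hits <- bi[grepl(paste0("^", words[n], " "), bi$bigram), ]
        if (nrow(hits) > 0)
            return(last_word_of(hits$bigram[which.max(hits$Freq)]))
    }

    # 3. Fall back to the single most frequent word
    as.character(uni$unigram[which.max(uni$Freq)])
}

predict_next_word("thanks for the")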