This document explains the major features of the data and briefly summarizes my plans for creating the prediction algorithm and Shiny app. The motivation for this project is to:
1. Download the data and successfully load it in.
2. Create a basic report of summary statistics about the data sets.
3. Report interesting findings found so far.
4. Get feedback on my plans for creating a prediction algorithm and Shiny app.
setwd("/Users/lt/Desktop/data sciences/capstone/final/en_US")
txt <- dir()   # the three en_US text files
con <- file(txt[1], "r")
blogs <- readLines(con)     # en_US.blogs.txt
close(con)
con <- file(txt[2], "r")
news <- readLines(con)      # en_US.news.txt
close(con)
con <- file(txt[3], "r")
twitter <- readLines(con)   # en_US.twitter.txt
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
close(con)
com_data <- list(blogs, news, twitter)
line_count <- sapply(com_data, length)
char_count <- sapply(com_data, function(x) sum(nchar(x)))
data.frame(file_name=txt,line_count,char_count)
## file_name line_count char_count
## 1 en_US.blogs.txt 899288 206824505
## 2 en_US.news.txt 1010242 203223159
## 3 en_US.twitter.txt 2360148 162096031
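The embedded-nul warnings above come from the Twitter file. If they need to be suppressed, base R's readLines() has a skipNul argument that drops the nul characters instead of warning; a minimal alternative for that file would be:
con <- file(txt[3], "r")
twitter <- readLines(con, skipNul = TRUE)   # drop embedded nuls instead of warning
close(con)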
Only a random subset (1% of each data set) is used for the exploratory data analysis. The three samples are then combined into a corpus (corpus_sample) for analysis.
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tm)
## Loading required package: tm
## Loading required package: NLP
set.seed(1234)
blog_sample <- sample(blogs, size=length(blogs)*0.01)
news_sample <- sample(news, size=length(news)*0.01)
twitter_sample <- sample(twitter, size=length(twitter)*0.01)
corpus_sample <- VCorpus(VectorSource(list(blog_sample, news_sample, twitter_sample)))
require(tm)
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
blogs.dtm <- DocumentTermMatrix(VCorpus(VectorSource(blog_sample)))
blogs.dtms <- removeSparseTerms(blogs.dtm , 0.999)
blog_freq<-sort(colSums(as.matrix(blogs.dtms)), decreasing=TRUE)
hist(blog_freq, breaks = 1000)
wordcloud(names(blog_freq), blog_freq, min.freq=100, max.words=100)
require(tm)
require(wordcloud)
news.dtm <- DocumentTermMatrix(VCorpus(VectorSource(news_sample)))
news.dtms <- removeSparseTerms(news.dtm , 0.999)
news_freq<-sort(colSums(as.matrix(news.dtms)), decreasing=TRUE)
hist(news_freq, breaks = 1000)
wordcloud(names(news_freq), news_freq, min.freq=100, max.words=100)
require(tm)
require(wordcloud)
twitter.dtm <- DocumentTermMatrix(VCorpus(VectorSource(twitter_sample)))
twitter.dtms <- removeSparseTerms(twitter.dtm , 0.999)
twitter_freq<-sort(colSums(as.matrix(twitter.dtms)), decreasing=TRUE)
hist(twitter_freq, breaks = 1000)
wordcloud(names(twitter_freq), twitter_freq, min.freq=100, max.words=100)
Convert the sample corpus to lower case, remove numbers, punctuation, URLs, and redundant whitespace, and stem the documents.
require(tm)
require(SnowballC)
## Loading required package: SnowballC
corpus_sample <- tm_map(corpus_sample, content_transformer(function(x) iconv(x, to="UTF-8",sub="byte")))
corpus_sample <- tm_map(corpus_sample, content_transformer(tolower))
corpus_sample <- tm_map(corpus_sample, content_transformer(removePunctuation))
corpus_sample <- tm_map(corpus_sample, content_transformer(removeNumbers))
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)   # strips the "http"/"https" token; a broader pattern such as "http[^[:space:]]*" would remove entire URLs
corpus_sample <- tm_map(corpus_sample, content_transformer(removeURL))
corpus_sample <- tm_map(corpus_sample, stripWhitespace)
#corpus_sample <- tm_map(corpus_sample, removeWords, stopwords("english"))   # optional: drop English stop words
#corpus_sample <- tm_map(corpus_sample, removeWords, profanityWords)         # optional: profanityWords would be a user-supplied character vector (not defined here)
corpus_sample <- tm_map(corpus_sample, stemDocument)
corpus_sample <- tm_map(corpus_sample, stripWhitespace)
require(RWeka)
## Loading required package: RWeka
GetNGramDataFrame <- function(corpus, ngram, sparse) {
  if (ngram == 1) {
    tdm <- TermDocumentMatrix(corpus)
  } else {
    tdm <- TermDocumentMatrix(corpus,
      control = list(tokenize = function(x) {
        NGramTokenizer(x, Weka_control(min = ngram, max = ngram))
      }))
  }
  tdm <- removeSparseTerms(tdm, sparse)
  nGramFrequency <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(nGramFrequency), count = nGramFrequency)
}
unigramDF <- GetNGramDataFrame(corpus_sample,1,0.99)
bigramDF <- GetNGramDataFrame(corpus_sample,2,0.99)
trigramDF <- GetNGramDataFrame(corpus_sample,3,0.99)
head(unigramDF,10)
## word count
## the the 47959
## and and 24315
## that that 11296
## for for 10823
## you you 9522
## with with 7205
## was was 6410
## have have 5848
## this this 5462
## but but 4897
head(bigramDF,10)
## word count
## of the of the 4265
## in the in the 4148
## to the to the 2241
## for the for the 1992
## on the on the 1921
## to be to be 1614
## at the at the 1497
## and the and the 1237
## in a in a 1185
## go to go to 1062
head(trigramDF,10)
## word count
## one of the one of the 359
## a lot of a lot of 313
## thank for the thank for the 249
## i want to i want to 201
## to be a to be a 196
## go to be go to be 178
## it was a it was a 151
## out of the out of the 147
## as well as as well as 145
## some of the some of the 144
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
p1 <- ggplot(data = head(unigramDF, n = 15), aes(reorder(word, -count), count)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Uni-Gram") + labs(title = "Uni-Gram Count")
p1
p2 <- ggplot(data = head(bigramDF, n = 15), aes(reorder(word, -count), count)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Bi-Gram") + labs(title = "Bi-Gram Count")
p2
p3 <- ggplot(data = head(trigramDF, n = 15), aes(reorder(word, -count), count)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Tri-Gram") + labs(title = "Tri-Gram Count")
p3
Loading and cleaning the data set takes considerable time because of the huge file sizes. To avoid excessively long runtimes, it was necessary to create a data sample for text mining and tokenization.
In addition, removing words from the data set is also very time-consuming, so it is better to save the cleaned data set for later use. In the prediction application, more thought needs to be given to which words are removed and why: for example, stop words such as “the”, “it”, “this”, “were”, and “was” might need to be excluded.
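As a minimal sketch (the file names are illustrative, not part of the original analysis), the cleaned corpus and the n-gram tables can be cached with saveRDS() and reloaded with readRDS(), so the expensive cleaning and tokenization steps do not have to be repeated in later sessions:
# cache the cleaned corpus and the n-gram frequency tables (illustrative file names)
saveRDS(corpus_sample, "corpus_sample.rds")
saveRDS(list(uni = unigramDF, bi = bigramDF, tri = trigramDF), "ngram_tables.rds")
# a later session can then skip the cleaning step entirely:
# ngrams <- readRDS("ngram_tables.rds")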
This exploratory analysis has allowed us to understand the distribution of, and the relationships between, the words, tokens, and phrases in the text.
Due to memory constraints, only a random subset (1% of each data source) has been included in the analysis.
The next step will consist of rebuilding the n-gram models, up to 4-grams, and building a predictive model that, given a word or phrase as input, tries to predict the most likely next word.
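The sketch below illustrates the idea using the trigram and bigram tables already built above; the helper name predict_next_word is hypothetical, and a simple back-off lookup (no smoothing, no input sanitisation) stands in for the final model:
# Hypothetical helper: look up the last two words of the input in the trigram table,
# back off to the bigram table, and finally to the most frequent unigram.
predict_next_word <- function(phrase, uni = unigramDF, bi = bigramDF, tri = trigramDF) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- tri[grepl(paste0("^", prefix, " "), tri$word), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  if (n >= 1) {
    hits <- bi[grepl(paste0("^", words[n], " "), bi$word), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  as.character(uni$word[1])        # fallback: the most frequent unigram
}
predict_next_word("one of")        # expected to return "the" for this sample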
The final model will be deployed as a Shiny application, so that the end user can easily interact with it.
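A minimal sketch of such an app, reusing the hypothetical predict_next_word() helper above (the layout and labels are assumptions, not the final design):
library(shiny)
ui <- fluidPage(
  titlePanel("Next-Word Prediction (sketch)"),
  textInput("phrase", "Type a phrase:", value = ""),
  verbatimTextOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(input$phrase) == 0) return("")
    predict_next_word(input$phrase)   # hypothetical helper defined in the sketch above
  })
}
shinyApp(ui, server)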