library(tm)
library(rJava)
library(RWeka)
library(SnowballC)
library(ggplot2)
library(ngram)
library(dplyr)
library(doParallel)
library(stringi)
library(stringr)
library(data.table)
library(gdata)
library(slam)
library(wordcloud)
library(Matrix)
library(MatrixModels)
library(qdap)
# Read the three source files as UTF-8 text; binary mode avoids an early
# end-of-file on embedded control characters
conn <- file("~/Desktop/Data Science Specialization/data/final/en_US/en_US.blogs.txt", open = "rb")
source.blogs <- readLines(conn, encoding = "UTF-8")
close(conn)
conn <- file("~/Desktop/Data Science Specialization/data/final/en_US/en_US.news.txt", open = "rb")
source.news <- readLines(conn, encoding = "UTF-8")
close(conn)
conn <- file("~/Desktop/Data Science Specialization/data/final/en_US/en_US.twitter.txt", open = "rb")
source.twitter <- readLines(conn, encoding = "UTF-8")
close(conn)
rm(conn)
# Randomly keep each line with probability `percent`, returning roughly that
# fraction of the data
sample.fun <- function(data, percent) {
  return(data[as.logical(rbinom(length(data), 1, percent))])
}
# Re-read the full files for the summary statistics below
en.US.twitter <- readLines("~/Desktop/Data Science Specialization/data/final/en_US/en_US.twitter.txt")
en.US.blogs <- readLines("~/Desktop/Data Science Specialization/data/final/en_US/en_US.blogs.txt")
en.US.news <- readLines("~/Desktop/Data Science Specialization/data/final/en_US/en_US.news.txt")
# File sizes in megabytes
twitter.filesize <- file.info("~/Desktop/Data Science Specialization/data/final/en_US/en_US.twitter.txt")$size/(1024^2)
blogs.filesize <- file.info("~/Desktop/Data Science Specialization/data/final/en_US/en_US.blogs.txt")$size/(1024^2)
news.filesize <- file.info("~/Desktop/Data Science Specialization/data/final/en_US/en_US.news.txt")$size/(1024^2)
# Line counts
twitter.length <- length(en.US.twitter)
blogs.length <- length(en.US.blogs)
news.length <- length(en.US.news)
# Character counts
twitter.char <- sum(nchar(en.US.twitter))
blogs.char <- sum(nchar(en.US.blogs))
news.char <- sum(nchar(en.US.news))
# To expedite calculation, word counts were computed on roughly the first
# 5,000 entries of each source (a handful of blog entries were excluded) and
# extrapolated to the full corpus
twitter.words <- sum(word_count(en.US.twitter[1:5000]))/5000*twitter.length
blogs.words <- (sum(word_count(en.US.blogs[1:607])) + sum(word_count(en.US.blogs[609:2430])) + sum(word_count(en.US.blogs[2433:3274])) + sum(word_count(en.US.blogs[3276:5000])))/4997*blogs.length
news.words <- sum(word_count(en.US.news[1:5000]))/5000*news.length
twitter.filesize
## [1] 159.3641
blogs.filesize
## [1] 200.4242
news.filesize
## [1] 196.2775
twitter.length
## [1] 2360148
blogs.length
## [1] 899288
news.length
## [1] 1010242
twitter.char
## [1] 162096031
blogs.char
## [1] 206824505
news.char
## [1] 203223159
twitter.words
## [1] 29132723
blogs.words
## [1] 36472904
news.words
## [1] 33617217
# Strip non-ASCII characters
source.blogs <- iconv(source.blogs, "latin1", "ASCII", sub="")
source.news <- iconv(source.news, "latin1", "ASCII", sub="")
source.twitter <- iconv(source.twitter, "latin1", "ASCII", sub="")
# Sample 10% of each source and write the samples to disk
percentage <- 0.1
sample.blogs <- sample.fun(source.blogs, percentage)
sample.news <- sample.fun(source.news, percentage)
sample.twitter <- sample.fun(source.twitter, percentage)
dir.create("sample", showWarnings = FALSE)
write(sample.blogs, "sample/sample.blogs.txt")
write(sample.news, "sample/sample.news.txt")
write(sample.twitter, "sample/sample.twitter.txt")
# Free the memory used by the full sources
remove(source.blogs)
remove(source.news)
remove(source.twitter)
# Combine the samples into a single corpus and clean it: lowercase, then strip
# punctuation, numbers, English stop words, and extra whitespace
sample.corpus <- c(sample.blogs, sample.news, sample.twitter)
my.corpus <- Corpus(VectorSource(list(sample.corpus)))
my.corpus <- tm_map(my.corpus, content_transformer(tolower))
my.corpus <- tm_map(my.corpus, removePunctuation)
my.corpus <- tm_map(my.corpus, removeNumbers)
my.corpus <- tm_map(my.corpus, removeWords, stopwords("english"))
my.corpus <- tm_map(my.corpus, stripWhitespace)
plotNGram <- function(n) {
  options(mc.cores = 1)
  # build an n-gram tokenizer and term-document matrix
  tk <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm <- TermDocumentMatrix(my.corpus, control = list(tokenize = tk))
  # find the 25 most frequent n-grams in the matrix
  ngram <- as.matrix(rollup(tdm, 2, na.rm = TRUE, FUN = sum))
  ngram <- data.frame(word = rownames(ngram), freq = ngram[, 1])
  ngram <- ngram[order(-ngram$freq), ][1:25, ]
  ngram$word <- factor(ngram$word, as.character(ngram$word))
  # plot the frequencies
  ggplot(ngram, aes(x = word, y = freq)) +
    ggtitle("Frequency of Words") +
    geom_bar(stat = "identity", fill = "#ED9626", color = "#855415") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("Word(s)") + ylab("Frequency")
}
plotNGram(1)
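The same function can be reused for higher-order n-grams; for example, the calls below (not run here) would plot the most frequent bigrams and trigrams.

plotNGram(2)   # 25 most frequent two-word tokens
plotNGram(3)   # 25 most frequent three-word tokens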
Now that we have performed some exploratory analysis, we are ready to start building the predictive model(s) and eventually the data product. Below are the high-level plans to achieve this goal:

* Use n-grams to generate tokens of one to four words.
* Summarize the frequency of tokens and find associations between tokens.
* Build predictive model(s) using the tokens (a rough sketch of such a lookup follows this list).
* Develop a data product (i.e., a Shiny app) that recommends (i.e., predicts) the next word based on user input.
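To illustrate the modeling step, the sketch below builds a bigram frequency table with the same RWeka tokenizer used above and looks up the most frequent followers of a given word. The names (bigram.dt, predict.next) and the plain frequency-lookup approach are placeholders chosen for this sketch only; the final model may use higher-order n-grams with backoff or smoothing.

# Sketch only: a bigram frequency lookup built from the cleaned sample corpus
bigram.tk <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigram.tdm <- TermDocumentMatrix(my.corpus, control = list(tokenize = bigram.tk))
bigram.freq <- as.matrix(rollup(bigram.tdm, 2, na.rm = TRUE, FUN = sum))[, 1]
# Split each bigram into its first and second word
bigram.parts <- strsplit(names(bigram.freq), " ", fixed = TRUE)
bigram.dt <- data.table(first = sapply(bigram.parts, `[`, 1),
                        second = sapply(bigram.parts, `[`, 2),
                        freq = as.numeric(bigram.freq))
# Return the n words most frequently observed after `word` in the sample
predict.next <- function(word, n = 3) {
  head(bigram.dt[first == tolower(word)][order(-freq)]$second, n)
}
predict.next("last")   # top candidate next words after "last"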