library(tm)
library(rJava)
library(RWeka)
library(SnowballC)
library(ggplot2)
library(ngram)
library(dplyr)
library(doParallel)
library(stringi)
library(stringr)
library(data.table)
library(gdata)
library(slam)
library(wordcloud)
library(Matrix)
library(MatrixModels)
library(qdap)

Load the Data

conn <- file("~/Desktop/Data Science Specialization/data/final/en_US/en_US.blogs.txt", open = "rb")
source.blogs <- readLines(conn, encoding = "UTF-8")
close(conn)

conn <- file("~/Desktop/Data Science Specialization/data/final/en_US/en_US.news.txt", open = "rb")
source.news <- readLines(conn, encoding = "UTF-8")
close(conn)

conn <- file("~/Desktop/Data Science Specialization/data/final/en_US/en_US.twitter.txt", open = "rb")
source.twitter <- readLines(conn, encoding = "UTF-8")
close(conn)

rm(conn)

# Keep roughly `percent` of the lines, using an independent Bernoulli draw per line
sample.fun <- function(data, percent)
{
    return(data[as.logical(rbinom(length(data), 1, percent))])
}

en.US.twitter <- readLines("~/Desktop/Data Science Specialization/data/final/en_US/en_US.twitter.txt")
en.US.blogs <- readLines("~/Desktop/Data Science Specialization/data/final/en_US/en_US.blogs.txt")
en.US.news <- readLines("~/Desktop/Data Science Specialization/data/final/en_US/en_US.news.txt")

Summarize the Data

twitter.filesize <- file.info("~/Desktop/Data Science Specialization/data/final/en_US/en_US.twitter.txt")$size/(1024^2)
blogs.filesize <- file.info("~/Desktop/Data Science Specialization/data/final/en_US/en_US.blogs.txt")$size/(1024^2)
news.filesize <- file.info("~/Desktop/Data Science Specialization/data/final/en_US/en_US.news.txt")$size/(1024^2)

twitter.length <- length(en.US.twitter)
blogs.length <- length(en.US.blogs)
news.length <- length(en.US.news)

twitter.char <- sum(nchar(en.US.twitter))
blogs.char <- sum(nchar(en.US.blogs))
news.char <- sum(nchar(en.US.news))

# To expedite calculation, word counts are computed on the first 5,000 entries of each source
# and scaled up to the full line count; a few problematic blog entries are skipped
twitter.words <- sum(word_count(en.US.twitter[1:5000]))/5000*twitter.length
blogs.words <- (sum(word_count(en.US.blogs[1:607])) + sum(word_count(en.US.blogs[609:2430])) + sum(word_count(en.US.blogs[2433:3274])) + sum(word_count(en.US.blogs[3276:5000])))/4997*blogs.length
news.words <- sum(word_count(en.US.news[1:5000]))/5000*news.length

twitter.filesize 
## [1] 159.3641
blogs.filesize 
## [1] 200.4242
news.filesize 
## [1] 196.2775
twitter.length 
## [1] 2360148
blogs.length 
## [1] 899288
news.length 
## [1] 1010242
twitter.char 
## [1] 162096031
blogs.char 
## [1] 206824505
news.char 
## [1] 203223159
twitter.words 
## [1] 29132723
blogs.words 
## [1] 36472904
news.words 
## [1] 33617217

Create Corpus and Remove Profanity from Dataset

source.blogs <- iconv(source.blogs, "latin1", "ASCII", sub="")
source.news <- iconv(source.news, "latin1", "ASCII", sub="")
source.twitter <- iconv(source.twitter, "latin1", "ASCII", sub="")
percentage <- 0.1

sample.blogs   <- sample.fun(source.blogs, percentage)
sample.news   <- sample.fun(source.news, percentage)
sample.twitter   <- sample.fun(source.twitter, percentage)
dir.create("sample", showWarnings = FALSE)

write(sample.blogs, "sample/sample.blogs.txt")
write(sample.news, "sample/sample.news.txt")
write(sample.twitter, "sample/sample.twitter.txt")

remove(source.blogs)
remove(source.news)
remove(source.twitter)
sample.corpus <- c(sample.blogs,sample.news,sample.twitter)
my.corpus <- Corpus(VectorSource(list(sample.corpus)))

my.corpus <- tm_map(my.corpus, content_transformer(tolower))
my.corpus <- tm_map(my.corpus, removePunctuation)
my.corpus <- tm_map(my.corpus, removeNumbers)
my.corpus <- tm_map(my.corpus, removeWords, stopwords("english"))
my.corpus <- tm_map(my.corpus, stripWhitespace)
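
The section title also calls for profanity removal, which the chunk above does not show. A minimal sketch of that step with tm's removeWords is below; the word list file ("profanity.txt", one banned term per line) is an assumption and not part of the original pipeline.

# Assumed word list: "profanity.txt" with one banned term per line
profanity <- readLines("profanity.txt", encoding = "UTF-8", warn = FALSE)
profanity <- tolower(trimws(profanity))
my.corpus <- tm_map(my.corpus, removeWords, profanity)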

Exploratory Analysis through nGrams of Most Common Words

plotNGram <- function(n) {
  options(mc.cores=1)
  
  # builds n-gram tokenizer and term document matrix
  tk <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm <- TermDocumentMatrix(my.corpus, control=list(tokenize=tk))
  
  # find 25 most frequent n-grams in the matrix
  ngram <- as.matrix(rollup(tdm, 2, na.rm=TRUE, FUN=sum))
  ngram <- data.frame(word=rownames(ngram), freq=ngram[,1])
  ngram <- ngram[order(-ngram$freq), ][1:25, ]
  ngram$word <- factor(ngram$word, as.character(ngram$word))
  
  # plots
  ggplot(ngram, aes(x = word, y = freq)) +
    geom_bar(stat = "identity", fill = "#ED9626", color = "#855415") +
    ggtitle("Frequency of Words") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("Word(s)") +
    ylab("Frequency")
}

plotNGram(1)
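
The same function can be reused for higher-order n-grams; for example, bigram and trigram frequency plots can be produced with the calls below (output not shown here).

plotNGram(2)   # 25 most frequent bigrams
plotNGram(3)   # 25 most frequent trigrams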

Plans for the Future

Now that we have performed some exploratory analysis, we are ready to start building the predictive model(s) and eventually the data product. Below are the high-level plans to achieve this goal (a rough sketch of the prediction step follows the list):

* Using N-grams to generate tokens of one to four words
* Summarizing the frequency of tokens and finding associations between tokens
* Building predictive model(s) using the tokens
* Developing a data product (i.e., a Shiny app) to make word recommendations (i.e., predictions) based on user input
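
As a rough sketch of the prediction step (not the final model), the code below builds a bigram frequency table from the sampled text with data.table and returns the most frequent continuation of a given word. The function names buildBigramTable and predictNext, and the example word "happy", are hypothetical placeholders.

# Hypothetical sketch: bigram frequency table from the sampled text
buildBigramTable <- function(text) {
  words <- unlist(strsplit(tolower(text), "[^a-z']+"))
  words <- words[words != ""]
  # Pair each word with the word that follows it
  dt <- data.table(w1 = head(words, -1), w2 = tail(words, -1))
  dt[, .N, by = .(w1, w2)][order(-N)]
}

# Return the most frequent word observed after `word`; NA if unseen
predictNext <- function(bigrams, word) {
  cand <- bigrams[w1 == tolower(word)]
  if (nrow(cand) == 0) return(NA_character_)
  cand$w2[1]
}

bigram.table <- buildBigramTable(sample.corpus)
predictNext(bigram.table, "happy")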