Goals:

Exploratory Data Analysis
Load and Clean Data
Create corpus
Plot the frequency of the most common words and of 2- and 3-word combinations

Exploratory Analysis

Load Data

setwd("F:/Coursera/Data Science Specialization/Capstone Course/final/en_US")

blog_file <- 'en_US.blogs.txt'
news_file <- 'en_US.news.txt'
twitter_file <- 'en_US.twitter.txt'

blog <- readLines(blog_file, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_file, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(news_file, encoding = "UTF-8", skipNul = TRUE):
## incomplete final line found on 'en_US.news.txt'
twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)
blog_length <- length(blog)
news_length <- length(news)
twitter_length <- length(twitter)

cat("Length of Blog:", blog_lenght, "\n") 
## Length of Blog: 899288
cat("Length of News: ",news_lenght, "\n")  
## Length of News:  77259
cat("Length of Twitter: ", twitter_length, "\n")  
## Length of Twitter:  2360148
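
As an additional exploratory summary, word counts and on-disk file sizes can be tabulated alongside the line counts. This is a minimal sketch; it assumes the stringi package is installed (it is not used elsewhere in this report).

library(stringi)

# Lines, total words, and file size (MB) per source
file_stats <- data.frame(
  source  = c("blogs", "news", "twitter"),
  lines   = c(length(blog), length(news), length(twitter)),
  words   = c(sum(stri_count_words(blog)),
              sum(stri_count_words(news)),
              sum(stri_count_words(twitter))),
  size_mb = round(file.size(c(blog_file, news_file, twitter_file)) / 1024^2, 1)
)
file_stats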

Sample Data and Create Corpus

library(tm)
## Warning: package 'tm' was built under R version 3.6.1
## Loading required package: NLP
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.6.1
set.seed(73)
blog_sample <- sample(blog, 3000, replace = FALSE)
news_sample <- sample(news, 3000, replace = FALSE)
twitter_sample <- sample(twitter, 3000, replace = FALSE)

sblog_length <- length(blog_sample)
snews_length <- length(news_sample)
stwitter_length <- length(twitter_sample)

cat("Length of Blog Sample:", sblog_lenght, "\n") 
## Length of Blog Sample: 3000
cat("Length of News Sample: ",snews_lenght, "\n")  
## Length of News Sample:  3000
cat("Length of Twitter Sample: ", stwitter_length, "\n")
## Length of Twitter Sample:  3000
data_sample <- c(blog_sample, news_sample, twitter_sample)

sdata_length <- length(data_sample)

cat("Length of Data Sample: ", sdata_length, "\n")
## Length of Data Sample:  9000
samp_data <- iconv(data_sample, "UTF-8", "ASCII", sub = "") # drop non-ASCII characters

corpus <- VCorpus(VectorSource(samp_data))

Clean Data

corpus <- tm_map(corpus, removePunctuation) # remove punctuation
corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lower case
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, stripWhitespace) # remove extra whitespace
corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove stopwords

# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeURL))

# remove any remaining characters that are not letters or whitespace
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeNumPunct))

save(corpus, file = 'corpus.RData')
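
Before building the term-document matrices, it is worth spot-checking that the transformations took effect. A small sketch using tm's as.character() accessor on a couple of arbitrary documents:

# Print the cleaned text of the first two sampled documents
writeLines(as.character(corpus[[1]]))
writeLines(as.character(corpus[[2]]))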

Plot n-grams

library(RWeka)
tokens1 <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
tokens2 <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tokens3 <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

matrix1 <- TermDocumentMatrix(corpus, control = list(tokenize = tokens1))
matrix2 <- TermDocumentMatrix(corpus, control = list(tokenize = tokens2))
matrix3 <- TermDocumentMatrix(corpus, control = list(tokenize = tokens3))
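
findFreqTerms() from tm gives a quick check on the unigram matrix before it is converted to a dense matrix; the frequency threshold of 100 used below is arbitrary.

# Unigrams that occur at least 100 times across the 9,000 sampled documents
findFreqTerms(matrix1, lowfreq = 100)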


# 1-gram (unigram) frequencies
m1 <- as.matrix(matrix1)
v1 <- sort(rowSums(m1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
head(d1, 10)
##        word freq
## said   said  855
## will   will  825
## one     one  761
## just   just  695
## can     can  675
## like   like  579
## time   time  561
## get     get  494
## new     new  466
## first first  412
library("wordcloud")
## Loading required package: RColorBrewer
wordcloud(words = d1$word, freq = d1$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(10, "Paired"))

# 2-gram (bigram) frequencies
m2 <- as.matrix(matrix2)
v2 <- sort(rowSums(m2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
head(d2, 10)
##                        word freq
## last year         last year   59
## new york           new york   45
## dont know         dont know   43
## years ago         years ago   43
## high school     high school   41
## right now         right now   38
## last week         last week   31
## can get             can get   30
## united states united states   30
## can see             can see   29
library("wordcloud")
wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(10, "Dark2"), scale=c(3.5,0.25))

# 3-gram (trigram) frequencies
m3 <- as.matrix(matrix3)
v3 <- sort(rowSums(m3),decreasing=TRUE)
d3 <- data.frame(word = names(v3),freq=v3)
head(d3, 10)
##                                            word freq
## president barack obama   president barack obama    7
## dont really know               dont really know    6
## new york city                     new york city    6
## wall street journal         wall street journal    5
## chief executive officer chief executive officer    4
## dont even know                   dont even know    4
## feel like im                       feel like im    4
## let us know                         let us know    4
## lot lot lot                         lot lot lot    4
## saturday night live         saturday night live    4
library("wordcloud")
wordcloud(words = d3$word, freq = d3$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(10, "Set1"), scale=c(3.5,0.25))

Next Steps

In this step I covered the basics: loading the data, cleaning it, and generating baseline summaries. I will continue to experiment with the sample size to make it as large as possible while still running within the Shiny app. Memory usage appears to be the main constraint as the sample size grows.
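
One memory-saving option (a sketch only, not used in this report) is to keep the term-document matrices in sparse form: slam::row_sums() computes term frequencies without the as.matrix() conversion, and removeSparseTerms() can drop very rare terms first. slam is a dependency of tm, so it should already be installed.

library(slam)

# Term frequencies computed directly on the sparse TDM (no densification)
v2_sparse <- sort(row_sums(matrix2), decreasing = TRUE)

# Optionally drop terms that appear in almost no documents before counting
matrix2_small <- removeSparseTerms(matrix2, sparse = 0.9999)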