library(knitr)
library(RColorBrewer)
library(stringi)
library(wordcloud2)
library(ggplot2)
library(ngram)
library(NLP)
library(tm)
library(slam)
library(xtable)
library(wordcloud)
library(dplyr)
data_dir <- '~/Data science course in coursera/Capstone/Capstone/en_US/'

fileInfo <- function(filePath, TextSource){
  
  # File size in megabytes
  fileSize <- file.info(filePath)$size / 1048576
  
  connection <- file(filePath, 'r')
  text       <- readLines(connection)
  nlines     <- length(text)
  
  # Length (in characters) of the longest line
  maxline <- max(nchar(text))
  
  # Total word count across all lines
  nwords <- sum(stri_count_words(text))
  
  df <- data.frame(
    TextSource,
    fileSize,
    nlines,
    maxline,
    nwords
  )
  
  close(connection)
  
  return(df)
}

BlogsConnection   <- file(paste0(data_dir, "en_US.blogs.txt"),   "r")

NewsConnection    <- file(paste0(data_dir, "en_US.news.txt"),    "r")

TwitterConnection <- file(paste0(data_dir, "en_US.twitter.txt"), "r")

The data comes from three text sources: blogs, news, and Twitter. The table below summarizes their structure (file size in MB, number of lines, length of the longest line in characters, and word count).

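The table is assembled by calling fileInfo on each file and binding the rows; the sketch below shows one way to reproduce it (the summary_table name is illustrative, not part of the original code).

# Sketch: build the summary table, one row per text source
summary_table <- rbind(
  fileInfo(paste0(data_dir, "en_US.blogs.txt"),   "Blogs"),
  fileInfo(paste0(data_dir, "en_US.news.txt"),    "News"),
  fileInfo(paste0(data_dir, "en_US.twitter.txt"), "Twitter")
)
summary_table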
##   TextSource fileSize  nlines maxline   nwords
## 1      Blogs 200.4242  899288   40835 38154238
## 2       News 196.2775   77259    5760  2693898
## 3    Twitter 159.3641 2360148     213 30218125

An initial analysis will be performed on a sample of 1,000 lines from each source.

blogs   <- readLines(BlogsConnection , 1000)
news    <- readLines(NewsConnection   , 1000)
twitter <- readLines(TwitterConnection, 1000)

corpus <- VCorpus(VectorSource(c(blogs, news, twitter)),
                  readerControl = list(reader = readPlain,
                                       language = "en"))

close(BlogsConnection)
close(NewsConnection )
close(TwitterConnection)

The next step is to use text mining techniques to clean and organize the data set.

The documents are converted to lowercase, and punctuation marks, numbers, English stopwords (e.g. “and”, “or”, “not”, “is”), and extra whitespace are removed.

corpus_lowercase               <- tm_map(corpus, content_transformer(tolower))
corpus_low_punct               <- tm_map(corpus_lowercase, removePunctuation)
corpus_low_punct_no            <- tm_map(corpus_low_punct, removeNumbers)
corpus_low_punct_no_stop       <- tm_map(corpus_low_punct_no, removeWords, stopwords("english"))
corpus_final <- tm_map(corpus_low_punct_no_stop, stripWhitespace)

To examine the data, we produce a word cloud showing frequently used terms in the combined sample, with word size varying by frequency.

wordcloud(corpus_final,
          max.words=75,
          random.order=TRUE,
          rot.per=.15,
          scale=c(3, .3))

The following unigram frequency plot shows the 30 most frequent words within the analyzed sample.

corpus_tdm        <- TermDocumentMatrix(corpus_final)
corpus_tdm_m      <- as.matrix(corpus_tdm)
corpus_tdm_m_freq <- rowSums(corpus_tdm_m)
corpus_tdm_m_freq <- sort(corpus_tdm_m_freq, decreasing = TRUE)

str(corpus_tdm_m_freq[1:30])
##  Named num [1:30] 304 259 254 249 248 191 191 186 171 144 ...
##  - attr(*, "names")= chr [1:30] "said" "will" "one" "just" ...
df <- data.frame(
  Word = names(corpus_tdm_m_freq[1:30]),
  Frequency = corpus_tdm_m_freq[1:30]
  )

df
##          Word Frequency
## said     said       304
## will     will       259
## one       one       254
## just     just       249
## like     like       248
## can       can       191
## time     time       191
## new       new       186
## get       get       171
## know     know       144
## day       day       141
## now       now       137
## good     good       131
## first   first       128
## people people       124
## much     much       122
## year     year       120
## make     make       112
## also     also       110
## two       two       106
## dont     dont       104
## love     love       102
## last     last        99
## really really        99
## see       see        99
## right   right        97
## think   think        97
## well     well        95
## going   going        93
## got       got        93
ggplot(df, aes(x = reorder(Word, -Frequency), y = Frequency)) +
  geom_col() +
  labs(title = "Unigrams", x = "Words") +
  theme(axis.text.x = element_text(angle = 90))