library(tm)
library(knitr)
library(dplyr)
library(stringi)
library(ggplot2)
library(wordcloud)
library(SnowballC)
News <- readLines(con <- file("C:/Users/alfre/Google Drive/Inteligencia Tecnica/En proceso/Data Science Specialization/10-Data Science Capstone/dataset and links/final/en_US/en_US.news.txt"),
encoding = "UTF-8",
skipNul = TRUE)
close(con)
Blog <- readLines(con <- file("C:/Users/alfre/Google Drive/Inteligencia Tecnica/En proceso/Data Science Specialization/10-Data Science Capstone/dataset and links/final/en_US/en_US.blogs.txt"),
encoding = "UTF-8",
skipNul = TRUE)
close(con)
Twitter <- readLines(con <- file("C:/Users/alfre/Google Drive/Inteligencia Tecnica/En proceso/Data Science Specialization/10-Data Science Capstone/dataset and links/final/en_US/en_US.twitter.txt"),
encoding = "UTF-8",
skipNul = TRUE)
close(con)
I compute the file size, number of entries, total number of characters, and the length of the longest entry for each file.
summary <- data.frame('File' = c("Blog","News","Twitter"),
"File Size" = sapply(list(Blog, News, Twitter), function(x){format(object.size(x),"MB")}),
'Nentries' = sapply(list(Blog, News, Twitter), function(x){length(x)}),
'TotalCharacters' = sapply(list(Blog, News, Twitter), function(x){sum(nchar(x))}),
'MaxCharacters' = sapply(list(Blog, News, Twitter), function(x){max(unlist(lapply(x, function(y) nchar(y))))})
)
summary
## File File.Size Nentries TotalCharacters MaxCharacters
## 1 Blog 248.5 Mb 899288 206824505 40833
## 2 News 19.2 Mb 77259 15639408 5760
## 3 Twitter 301.4 Mb 2360148 162096241 140
The Twitter file is the largest, with 2,360,148 entries and an object size of 301.4 Mb. However, the blog file contains the longest single entry, at 40,833 characters.
Because the data sets are so large, I will only proceed with a 5% subset of each. I will then clean the data and convert it to a corpus.
set.seed(1015) # Setting the seed for reproducibility
samp_size <- 0.05 # Setting the subset to be 5% of each file
# Create indices for the sampling of the datasets.
blogs_ind <- sample(seq_len(length(Blog)),length(Blog)*samp_size)
news_ind <- sample(seq_len(length(News)),length(News)*samp_size)
twitter_ind <- sample(seq_len(length(Twitter)),length(Twitter)*samp_size)
# Now select the 5% sample from each file.
blogs_sub <- Blog[blogs_ind]
news_sub <- News[news_ind]
twitter_sub <- Twitter[twitter_ind]
# Use the tm package (loaded above) to create a corpus out of all 3 sampled datasets.
corpus <- Corpus(VectorSource(c(blogs_sub,news_sub,twitter_sub)),
readerControl=list(reader=readPlain,language="en"))
# Clean the corpus dataset by removing non-ASCII characters.
corpus <- Corpus(VectorSource(sapply(corpus, function(row) iconv(row,"latin1","ASCII",sub=""))))
# Clean the data further by removing punctuation, unnecessary white spaces, and numbers, and
# converting to lower case.
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
# corpus <- tm_map(corpus, PlainTextDocument)
Now that we have a clean dataset, we need to convert it to a format that is most useful for Natural Language Processing (NLP). The format of choice is N-grams stored in Term Document Matrices (TDMs). The N-gram representation of a text lists all N-tuples of words that appear. The simplest case is the unigram, which is based on individual words; the bigram is based on pairs of two words, and so on. The TDMs store the frequencies of the N-grams in the respective sources.
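To make the idea concrete, here is a minimal illustration of the bigrams produced for a toy sentence. It is not part of the analysis pipeline and simply uses the same RWeka tokenizer loaded in the next chunk.
library(RWeka)
NGramTokenizer("the quick brown fox", Weka_control(min = 2, max = 2))
# expected result: "the quick" "quick brown" "brown fox"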
library(RWeka) # Weka is a collection of machine learning algorithms for data mining
## Warning: package 'RWeka' was built under R version 3.4.4
# I tried to solve this with a single function that takes two arguments instead of four separate functions but could not get it to work; a possible alternative is sketched after the four definitions below.
TokenUnigram <- function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
TokenBigram <- function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
TokenTrigram <- function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
TokenQuadgram <- function(x) NGramTokenizer(x,Weka_control(min=4,max=4))
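# A possible single-function alternative (a sketch only, not used below): a factory that
# returns a tokenizer for any n, collapsing the four definitions above into one.
TokenNgram <- function(n) {
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}
# e.g. TermDocumentMatrix(corpus, control = list(tokenize = TokenNgram(2)))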
Unigram <- TermDocumentMatrix(corpus, control=list(tokenize=TokenUnigram))
Bigram <- TermDocumentMatrix(corpus, control=list(tokenize=TokenBigram))
Trigram <- TermDocumentMatrix(corpus, control=list(tokenize=TokenTrigram))
Quadgram <- TermDocumentMatrix(corpus, control=list(tokenize=TokenQuadgram))
Unigram
## <<TermDocumentMatrix (terms: 77935, documents: 166833)>>
## Non-/sparse entries: 1266347/13000863508
## Sparsity : 100%
## Maximal term length: 109
## Weighting : term frequency (tf)
Bigram
## <<TermDocumentMatrix (terms: 77935, documents: 166833)>>
## Non-/sparse entries: 1266347/13000863508
## Sparsity : 100%
## Maximal term length: 109
## Weighting : term frequency (tf)
Trigram
## <<TermDocumentMatrix (terms: 77935, documents: 166833)>>
## Non-/sparse entries: 1266347/13000863508
## Sparsity : 100%
## Maximal term length: 109
## Weighting : term frequency (tf)
Quadgram
## <<TermDocumentMatrix (terms: 77935, documents: 166833)>>
## Non-/sparse entries: 1266347/13000863508
## Sparsity : 100%
## Maximal term length: 109
## Weighting : term frequency (tf)
The above matrices are extremely sparse (i.e. they are almost entirely composed of zeroes). We need to create denser matrices for the exploratory analyses by removing rare N-grams.
# Write a function that sums up the rows and sorts by N-gram frequency.
freqframe <- function(tdm) {
freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
freqframe <- data.frame(word=names(freq), freq=freq)
return(freqframe)
}
# Create matrices that are denser, and then add up and sort the matrices.
UnigramDense <- removeSparseTerms(Unigram, 0.999)
UnigramDenseSort <- freqframe(UnigramDense)
BigramDense <- removeSparseTerms(Bigram, 0.999)
BigramDenseSort <- freqframe(BigramDense)
TrigramDense <- removeSparseTerms(Trigram, 0.999)
TrigramDenseSort <- freqframe(TrigramDense)
QuadgramDense <- removeSparseTerms(Quadgram, 0.999)
QuadgramDenseSort <- freqframe(QuadgramDense)
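# Optional sanity check (a sketch): the dense matrices should retain far fewer terms
# than the full TDMs while keeping the same number of documents.
dim(Unigram)
dim(UnigramDense)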
# Plot the frequencies of the Unigrams.
GGUni <- ggplot(data=UnigramDenseSort[1:50,],aes(x=reorder(word, -freq),y=freq)) + geom_bar(stat="identity")
GGUni <- GGUni + labs(x="N-gram", y="Frequency", title="Frequencies of the 50 Most Abundant Unigrams (individual words)")
GGUni <- GGUni + theme(axis.text.x=element_text(angle=90))
GGUni
# Plot the frequencies of the Trigrams.
GGTri <- ggplot(data=TrigramDenseSort[1:50,],aes(x=reorder(word, -freq),y=freq)) + geom_bar(stat="identity")
GGTri <- GGTri + labs(x="N-gram", y="Frequency", title="Frequencies of the 50 Most Abundant Trigrams (triplets of words)")
GGTri <- GGTri + theme(axis.text.x=element_text(angle=90))
GGTri
# Plot the frequencies of the Quadgrams.
GGQuad <- ggplot(data=QuadgramDenseSort[1:50,],aes(x=reorder(word, -freq),y=freq)) + geom_bar(stat="identity")
GGQuad <- GGQuad + labs(x="N-gram", y="Frequency", title="Frequencies of the 50 Most Abundant Quadgrams (quartets of words)")
GGQuad <- GGQuad + theme(axis.text.x=element_text(angle=90))
GGQuad
It can be seen that many of the most frequent words are stopwords. To get an idea of the most frequent words without them, we can remove the stopwords from the corpus. Let's plot the result as a word cloud:
wo_stopw_corpus<-tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
wordcloud(wo_stopw_corpus, max.words=50, random.order=FALSE, colors=brewer.pal(8,"Greens"))
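For a numeric view, the most frequent stopword-free unigrams can also be tabulated. This is a sketch that reuses the freqframe helper defined above and is not required for the word cloud.
wo_stopw_tdm <- removeSparseTerms(TermDocumentMatrix(wo_stopw_corpus), 0.999)
head(freqframe(wo_stopw_tdm), 10)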
If I plot the corpus without removing the stopwords, the result is:
wordcloud(corpus, max.words=50, random.order=FALSE, colors=brewer.pal(8,"Greens"))