library(tm)
library(knitr)
library(dplyr)
library(stringi)
library(ggplot2)
library(wordcloud)
library(SnowballC)
News <- readLines(con <- file("C:/Users/alfre/Google Drive/Inteligencia Tecnica/En proceso/Data Science Specialization/10-Data Science Capstone/dataset and links/final/en_US/en_US.news.txt"),
encoding = "UTF-8",
skipNul = TRUE)
close(con)
Blog <- readLines(con <- file("C:/Users/alfre/Google Drive/Inteligencia Tecnica/En proceso/Data Science Specialization/10-Data Science Capstone/dataset and links/final/en_US/en_US.blogs.txt"),
encoding = "UTF-8",
skipNul = TRUE)
close(con)
Twitter <- readLines(con <- file("C:/Users/alfre/Google Drive/Inteligencia Tecnica/En proceso/Data Science Specialization/10-Data Science Capstone/dataset and links/final/en_US/en_US.twitter.txt"),
encoding = "UTF-8",
skipNul = TRUE)
close(con)
I compute the file size, number of entries, total number of characters, and the length of the longest entry for each file.
summary <- data.frame('File' = c("Blog","News","Twitter"),
"File Size" = sapply(list(Blog, News, Twitter), function(x){format(object.size(x),"MB")}),
'Nentries' = sapply(list(Blog, News, Twitter), function(x){length(x)}),
'TotalCharacters' = sapply(list(Blog, News, Twitter), function(x){sum(nchar(x))}),
'MaxCharacters' = sapply(list(Blog, News, Twitter), function(x){max(unlist(lapply(x, function(y) nchar(y))))})
)
summary
## File File.Size Nentries TotalCharacters MaxCharacters
## 1 Blog 248.5 Mb 899288 206824505 40833
## 2 News 19.2 Mb 77259 15639408 5760
## 3 Twitter 301.4 Mb 2360148 162096241 140
The Twitter file is the largest, with 2,360,148 entries and an object size of 301.4 Mb. However, the blog file contains the longest single entry, at 40,833 characters.
Because the data sets are so large, I will only proceed with a 5% subset of each. I will then clean the data and convert it to a corpus.
set.seed(1015) # Setting the seed for reproducibility
samp_size <- 0.05 # Setting the subset to be 5% of each file
# Create indices for the sampling of the datasets.
blogs_ind <- sample(seq_len(length(Blog)),length(Blog)*samp_size)
news_ind <- sample(seq_len(length(News)),length(News)*samp_size)
twitter_ind <- sample(seq_len(length(Twitter)),length(Twitter)*samp_size)
# Now select the 5% sample from each file.
blogs_sub <- Blog[blogs_ind]
news_sub <- News[news_ind]
twitter_sub <- Twitter[twitter_ind]
# Use the tm package (loaded above) to create a corpus out of all 3 sampled datasets.
corpus <- Corpus(VectorSource(c(blogs_sub,news_sub,twitter_sub)),
readerControl=list(reader=readPlain,language="en"))
# Clean the corpus dataset by removing non-ASCII characters.
corpus <- Corpus(VectorSource(sapply(corpus, function(row) iconv(row,"latin1","ASCII",sub=""))))
# Clean the data further by removing punctuation, unnecessary white spaces, and numbers, and
# converting to lower case.
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
# corpus <- tm_map(corpus, PlainTextDocument)
Now that we have a clean dataset, we need to convert it to a format that is most useful for Natural Language Processing (NLP). The format of choice is N-grams stored in Term Document Matrices (TDMs). The N-gram representation of a text lists all N-tuples of words that appear. The simplest case is the unigram, which is based on individual words; the bigram is based on pairs of two words, and so on. The TDMs store the frequencies of the N-grams in the respective sources.
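To make the idea concrete, here is a minimal illustration of the bigrams produced for a toy sentence. It is not part of the analysis pipeline and simply uses the same RWeka tokenizer loaded in the next chunk.
library(RWeka)
NGramTokenizer("the quick brown fox", Weka_control(min = 2, max = 2))
# expected result: "the quick" "quick brown" "brown fox"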
library(RWeka) # Weka is a collection of machine learning algorithms for data mining
## Warning: package 'RWeka' was built under R version 3.4.4
# I tried to solve this with a single function that takes two arguments instead of four separate functions but could not get it to work; a possible alternative is sketched after the four definitions below.
TokenUnigram <- function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
TokenBigram <- function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
TokenTrigram <- function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
TokenQuadgram <- function(x) NGramTokenizer(x,Weka_control(min=4,max=4))
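# A possible single-function alternative (a sketch only, not used below): a factory that
# returns a tokenizer for any n, collapsing the four definitions above into one.
TokenNgram <- function(n) {
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}
# e.g. TermDocumentMatrix(corpus, control = list(tokenize = TokenNgram(2)))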
Unigram <- TermDocumentMatrix(corpus, control=list(tokenize=TokenUnigram))
Bigram <- TermDocumentMatrix(corpus, control=list(tokenize=TokenBigram))
Trigram <- TermDocumentMatrix(corpus, control=list(tokenize=TokenTrigram))
Quadgram <- TermDocumentMatrix(corpus, control=list(tokenize=TokenQuadgram))
Unigram
## <<TermDocumentMatrix (terms: 77935, documents: 166833)>>
## Non-/sparse entries: 1266347/13000863508
## Sparsity : 100%
## Maximal term length: 109
## Weighting : term frequency (tf)
Bigram
## <<TermDocumentMatrix (terms: 77935, documents: 166833)>>
## Non-/sparse entries: 1266347/13000863508
## Sparsity : 100%
## Maximal term length: 109
## Weighting : term frequency (tf)
Trigram
## <<TermDocumentMatrix (terms: 77935, documents: 166833)>>
## Non-/sparse entries: 1266347/13000863508
## Sparsity : 100%
## Maximal term length: 109
## Weighting : term frequency (tf)
Quadgram
## <<TermDocumentMatrix (terms: 77935, documents: 166833)>>
## Non-/sparse entries: 1266347/13000863508
## Sparsity : 100%
## Maximal term length: 109
## Weighting : term frequency (tf)
The above matrices are extremely sparse (i.e. they are almost entirely composed of zeroes). We need to create denser matrices for the exploratory analyses by removing rare N-grams.
# Write a function that sums up the rows and sorts by N-gram frequency.
freqframe <- function(tdm) {
freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
freqframe <- data.frame(word=names(freq), freq=freq)
return(freqframe)
}
# Create matrices that are denser, and then add up and sort the matrices.
UnigramDense <- removeSparseTerms(Unigram, 0.999)
UnigramDenseSort <- freqframe(UnigramDense)
BigramDense <- removeSparseTerms(Bigram, 0.999)
BigramDenseSort <- freqframe(BigramDense)
TrigramDense <- removeSparseTerms(Trigram, 0.999)
TrigramDenseSort <- freqframe(TrigramDense)
QuadgramDense <- removeSparseTerms(Quadgram, 0.999)
QuadgramDenseSort <- freqframe(QuadgramDense)
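# Optional sanity check (a sketch): the dense matrices should retain far fewer terms
# than the full TDMs while keeping the same number of documents.
dim(Unigram)
dim(UnigramDense)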
# Plot the frequencies of the Unigrams.
GGUni <- ggplot(data=UnigramDenseSort[1:50,],aes(x=reorder(word, -freq),y=freq)) + geom_bar(stat="identity")
GGUni <- GGUni + labs(x="N-gram", y="Frequency", title="Frequencies of the 50 Most Abundant Unigrams (individual words)")
GGUni <- GGUni + theme(axis.text.x=element_text(angle=90))
GGUni
# Plot the frequencies of the Trigrams.
GGTri <- ggplot(data=TrigramDenseSort[1:50,],aes(x=reorder(word, -freq),y=freq)) + geom_bar(stat="identity")
GGTri <- GGTri + labs(x="N-gram", y="Frequency", title="Frequencies of the 50 Most Abundant Trigrams (triplets of words)")
GGTri <- GGTri + theme(axis.text.x=element_text(angle=90))
GGTri
# Plot the frequencies of the Quadgrams.
GGQuad <- ggplot(data=QuadgramDenseSort[1:50,],aes(x=reorder(word, -freq),y=freq)) + geom_bar(stat="identity")
GGQuad <- GGQuad + labs(x="N-gram", y="Frequency", title="Frequencies of the 50 Most Abundant Quadgrams (quartets of words)")
GGQuad <- GGQuad + theme(axis.text.x=element_text(angle=90))
GGQuad
It can be seen that many of the most frequent words are stopwords. To get an idea of the most frequent words without them, we can remove the stopwords from the corpus. Let's plot the result as a word cloud:
wo_stopw_corpus<-tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
wordcloud(wo_stopw_corpus, max.words=50, random.order=FALSE, colors=brewer.pal(8,"Greens"))
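For a numeric view, the most frequent stopword-free unigrams can also be tabulated. This is a sketch that reuses the freqframe helper defined above and is not required for the word cloud.
wo_stopw_tdm <- removeSparseTerms(TermDocumentMatrix(wo_stopw_corpus), 0.999)
head(freqframe(wo_stopw_tdm), 10)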
If I plot the corpus without removing the stopwords, the result is:
wordcloud(corpus, max.words=50, random.order=FALSE, colors=brewer.pal(8,"Greens"))