This is an exploration of the English blog, news and Twitter data for the Coursera Johns Hopkins Data Science Specialization capstone project. For sources, refer to the week 2 data sources in the specialization. I decided to leave the whole script exposed because I had some trouble making the quadgram (groups of four words) analysis work, and I hope to get feedback on that sooner or later. Although the quadgram plots do not appear at the end as shown results, all the work done should be sufficient to pass the weekly peer-graded assignment.
#Setting working directory and seed
setwd("C:/rtests")
set.seed(10000)
library(wordcloud)
## Loading required package: RColorBrewer
library(tm)
## Loading required package: NLP
library(slam)
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ngram)
#Read blogs, news and twitter txt files from working directory
p1 <- "./en_US.blogs.txt"
p2 <- "./en_US.news.txt"
p3 <- "./en_US.twitter.txt"
xtemp <- file(p1, open="rb"); us_blogs <- readLines(xtemp, encoding="UTF-8"); close(xtemp)
xtemp <- file(p2, open="rb"); us_news <- readLines(xtemp, encoding="UTF-8"); close(xtemp)
xtemp <- file(p3, open="rb"); us_twitter <- readLines(xtemp, encoding="UTF-8"); close(xtemp)
## Warning in readLines(xtemp, encoding = "UTF-8"): line 167155 appears to
## contain an embedded nul
## Warning in readLines(xtemp, encoding = "UTF-8"): line 268547 appears to
## contain an embedded nul
## Warning in readLines(xtemp, encoding = "UTF-8"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines(xtemp, encoding = "UTF-8"): line 1759032 appears to
## contain an embedded nul
rm(xtemp)
rm(p1)
rm(p2)
rm(p3)
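The embedded-nul warnings above come from the Twitter file and are harmless for this analysis. If you prefer readLines() to drop the nul characters silently, it accepts a skipNul argument; a minimal alternative read, shown here only for the Twitter file:
#Alternative read that skips embedded nul characters instead of warning about them
xtemp <- file("./en_US.twitter.txt", open="rb")
us_twitter <- readLines(xtemp, encoding="UTF-8", skipNul=TRUE)
close(xtemp); rm(xtemp)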
wordcloud(us_blogs, max.words = 100, random.order = FALSE,rot.per=0.35, use.r.layout=FALSE,colors=brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
wordcloud(us_news, max.words = 100, random.order = FALSE,rot.per=0.35, use.r.layout=FALSE,colors=brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
wordcloud(us_twitter, max.words = 100, random.order = FALSE,rot.per=0.35, use.r.layout=FALSE,colors=brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
The following data table contains, for each .txt file, the size of the text object in memory, the number of lines, the word count, and the love/hate ratio (the number of lines containing "love" divided by the number of lines containing "hate").
datatable <- data.frame ('File' = c("Blogs","News","Twitter"),
"FileSize" = sapply(list(us_blogs, us_news, us_twitter),
function(x){format(object.size(x),"MB")}),
'Lines' = sapply(list(us_blogs, us_news, us_twitter),
function(x){length(x)}),
'WordCount' = sapply(list(us_blogs, us_news, us_twitter),
function(x){wordcount(x)}),
'LoveHateRatio' = sapply(list(us_blogs, us_news, us_twitter),
function(x){(length(grep("love", x)))/(length(grep("hate", x)))})
)
datatable
## File FileSize Lines WordCount LoveHateRatio
## 1 Blogs 255.4 Mb 899288 37334131 4.430258
## 2 News 257.3 Mb 1010242 34372530 3.267445
## 3 Twitter 319 Mb 2360148 30373543 4.108592
It seems there is less love in the news than in other places.
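Note that object.size() reports the size of each character vector in memory rather than the size of the original .txt file on disk. If the on-disk size is wanted, base R's file.size() can be used instead; a small sketch, assuming the three files are still in the working directory:
#On-disk file sizes in MB (sketch; same paths as p1, p2 and p3 above)
sapply(c("./en_US.blogs.txt", "./en_US.news.txt", "./en_US.twitter.txt"),
       function(p){round(file.size(p)/1024^2, 1)})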
#Subsampling the data to 1%
us_blogs_sample <- sample(seq_len(length(us_blogs)),length(us_blogs)*0.01)
us_news_sample <- sample(seq_len(length(us_news)),length(us_news)*0.01)
us_twitter_sample <- sample(seq_len(length(us_twitter)),length(us_twitter)*0.01)
us_blogs_0.01 <- us_blogs[us_blogs_sample]
us_news_0.01 <- us_news[us_news_sample]
us_twitter_0.01 <- us_twitter[us_twitter_sample]
rm(us_blogs_sample)
rm(us_news_sample)
rm(us_twitter_sample)
rm(us_blogs)
rm(us_news)
rm(us_twitter)
rm(datatable)
us_blogs_c <- Corpus(VectorSource(c(us_blogs_0.01)), readerControl=list(reader=readPlain,language="en")) # Make corpus
us_news_c <- Corpus(VectorSource(c(us_news_0.01)), readerControl=list(reader=readPlain,language="en")) # Make corpus
us_twitter_c <- Corpus(VectorSource(c(us_twitter_0.01)), readerControl=list(reader=readPlain,language="en")) # Make corpus
#rm(us_blogs_0.01)
#rm(us_news_0.01)
#rm(us_twitter_0.01)
us_blogs_c <- Corpus(VectorSource(sapply(us_blogs_c, function(row) iconv(row, "latin1", "ASCII", sub="")))) # Remove non-ASCII
us_news_c <- Corpus(VectorSource(sapply(us_news_c, function(row) iconv(row, "latin1", "ASCII", sub="")))) # Remove non-ASCII
us_twitter_c <- Corpus(VectorSource(sapply(us_twitter_c, function(row) iconv(row, "latin1", "ASCII", sub="")))) # Remove non-ASCII
us_blogs_c <- tm_map(us_blogs_c, removePunctuation) # Remove punctuation
## Warning in tm_map.SimpleCorpus(us_blogs_c, removePunctuation):
## transformation drops documents
us_blogs_c <- tm_map(us_blogs_c, stripWhitespace) # Remove white spaces
## Warning in tm_map.SimpleCorpus(us_blogs_c, stripWhitespace): transformation
## drops documents
us_blogs_c <- tm_map(us_blogs_c, content_transformer(tolower)) # Convert to lowercase
## Warning in tm_map.SimpleCorpus(us_blogs_c, content_transformer(tolower)):
## transformation drops documents
us_blogs_c <- tm_map(us_blogs_c, removeNumbers) # Remove numbers
## Warning in tm_map.SimpleCorpus(us_blogs_c, removeNumbers): transformation
## drops documents
us_news_c <- tm_map(us_news_c, removePunctuation) # Remove punctuation
## Warning in tm_map.SimpleCorpus(us_news_c, removePunctuation):
## transformation drops documents
us_news_c <- tm_map(us_news_c, stripWhitespace) # Remove white spaces
## Warning in tm_map.SimpleCorpus(us_news_c, stripWhitespace): transformation
## drops documents
us_news_c <- tm_map(us_news_c, content_transformer(tolower)) # Convert to lowercase
## Warning in tm_map.SimpleCorpus(us_news_c, content_transformer(tolower)):
## transformation drops documents
us_news_c <- tm_map(us_news_c, removeNumbers) # Remove numbers
## Warning in tm_map.SimpleCorpus(us_news_c, removeNumbers): transformation
## drops documents
us_twitter_c <- tm_map(us_twitter_c, removePunctuation) # Remove punctuation
## Warning in tm_map.SimpleCorpus(us_twitter_c, removePunctuation):
## transformation drops documents
us_twitter_c <- tm_map(us_twitter_c, stripWhitespace) # Remove white spaces
## Warning in tm_map.SimpleCorpus(us_twitter_c, stripWhitespace):
## transformation drops documents
us_twitter_c <- tm_map(us_twitter_c, content_transformer(tolower)) # Convert to lowercase
## Warning in tm_map.SimpleCorpus(us_twitter_c, content_transformer(tolower)):
## transformation drops documents
us_twitter_c <- tm_map(us_twitter_c, removeNumbers) # Remove numbers
## Warning in tm_map.SimpleCorpus(us_twitter_c, removeNumbers): transformation
## drops documents
#us_twitter_c <- tm_map(us_twitter_c, function(x) iconv(enc2utf8(x), sub = "byte"))
# Tokenizer functions for the n-gram sizes (unigrams and quadgrams)
OneWordgram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
Fourwordsgram <- function(y) NGramTokenizer(y, Weka_control(min = 4, max = 4))
OneWord_blogs <- TermDocumentMatrix(us_blogs_c, control = list(tokenize = OneWordgram))
Fourwords_blogs <- TermDocumentMatrix(us_blogs_c, control = list(tokenize = Fourwordsgram))
OneWord_news <- TermDocumentMatrix(us_news_c, control = list(tokenize = OneWordgram))
Fourwords_news <- TermDocumentMatrix(us_news_c, control = list(tokenize = Fourwordsgram))
OneWord_twitter <- TermDocumentMatrix(us_twitter_c, control = list(tokenize = OneWordgram))
Fourwords_twitter <- TermDocumentMatrix(us_twitter_c, control = list(tokenize = Fourwordsgram))
# Sum of rows and sorting by n-gram frequency
freq_ngram <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  freq_ngram <- data.frame(word=names(freq), freq=freq)
  return(freq_ngram)
}
# Remove sparse terms to shrink the matrices, then sum and sort by frequency
OneWord_blogs2 <- removeSparseTerms(OneWord_blogs, 0.999)
OneWord_blogs3 <- freq_ngram(OneWord_blogs2)
Fourwords_blogs2 <- removeSparseTerms(Fourwords_blogs, 0.9999)
Fourwords_blogs3 <- freq_ngram(Fourwords_blogs2)
OneWord_news2 <- removeSparseTerms(OneWord_news, 0.999)
OneWord_news3 <- freq_ngram(OneWord_news2)
Fourwords_news2 <- removeSparseTerms(Fourwords_news, 0.9999)
Fourwords_news3 <- freq_ngram(Fourwords_news2)
OneWord_twitter2 <- removeSparseTerms(OneWord_twitter, 0.999)
OneWord_twitter3 <- freq_ngram(OneWord_twitter2)
Fourwords_twitter2 <- removeSparseTerms(Fourwords_twitter, 0.9999)
Fourwords_twitter3 <- freq_ngram(Fourwords_twitter2)
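The sparsity thresholds above (0.999 for unigrams, 0.9999 for quadgrams) determine how many terms survive. A quick sanity check is to compare the matrix dimensions before and after removing sparse terms; a small sketch for the blogs matrices only:
#How many terms remain after removeSparseTerms (blogs only)
dim(OneWord_blogs); dim(OneWord_blogs2)
dim(Fourwords_blogs); dim(Fourwords_blogs2)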
#Histogram top 25 most used words - BLOGS
OWB <- ggplot(data = OneWord_blogs3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
OWB <- OWB + labs(x = "Single word", y = "Frequency", title = "Frequencies of the 25 Most Abundant Unigrams (individual words) in BLOGS")
OWB <- OWB + theme(axis.text.x=element_text(angle=90))
OWB
#Histogram top 25 most used words - NEWS
OWN <- ggplot(data = OneWord_news3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
OWN <- OWN + labs(x = "Single word", y = "Frequency", title = "Frequencies of the 25 Most Abundant Unigrams (individual words) in NEWS")
OWN <- OWN + theme(axis.text.x=element_text(angle=90))
OWN
#Histogram top 25 most used words - TWITTER
OWT <- ggplot(data = OneWord_twitter3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
OWT <- OWT + labs(x = "Single word", y = "Frequency", title = "Frequencies of the 25 Most Abundant Unigrams (individual words) in TWITTER")
OWT <- OWT + theme(axis.text.x=element_text(angle=90))
OWT
For some reason I could not make the quadgram (groups of four words) plots work. Even after spending hours searching the web and the documentation I could not figure out why, so if anybody has any idea I will appreciate suggestions. I'll leave the code for the quadgram histograms below.
#Histogram of the 25 most used quadgrams (quartets of words) - BLOGS
FWB <- ggplot(data = Fourwords_blogs3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
FWB <- FWB + labs(x = "Quadgrams", y = "Frequency", title = "Frequencies of the 25 Most Abundant Quadgrams (quartets of words) in BLOGS")
FWB <- FWB + theme(axis.text.x=element_text(angle=90))
FWB
#Histogram of the 25 most used quadgrams (quartets of words) - NEWS
FWN <- ggplot(data = Fourwords_news3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
FWN <- FWN + labs(x = "Quadgrams", y = "Frequency", title = "Frequencies of the 25 Most Abundant Quadgrams (quartets of words) in NEWS")
FWN <- FWN + theme(axis.text.x=element_text(angle=90))
FWN
#Histogram of the 25 most used quadgrams (quartets of words) - TWITTER
FWT <- ggplot(data = Fourwords_twitter3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
FWT <- FWT + labs(x = "Quadgrams", y = "Frequency", title = "Frequencies of the 25 Most Abundant Quadgrams (quartets of words) in TWITTER")
FWT <- FWT + theme(axis.text.x=element_text(angle=90))
FWT
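One possible cause of the quadgram problem, offered only as a guess: Corpus() on a character vector returns a SimpleCorpus, and TermDocumentMatrix() on a SimpleCorpus does not support custom tokenize functions, so the "quadgram" matrices may actually contain single words rather than four-word phrases. If that is the cause, building the corpora with VCorpus() should let the RWeka tokenizer take effect. A sketch for the blogs data only (the _v objects are new names introduced just for this sketch):
#Sketch: rebuild the blogs corpus as a VCorpus so the custom tokenizer is used
us_blogs_v <- VCorpus(VectorSource(us_blogs_0.01))
us_blogs_v <- tm_map(us_blogs_v, removePunctuation)
us_blogs_v <- tm_map(us_blogs_v, stripWhitespace)
us_blogs_v <- tm_map(us_blogs_v, content_transformer(tolower))
us_blogs_v <- tm_map(us_blogs_v, removeNumbers)
Fourwords_blogs_v <- TermDocumentMatrix(us_blogs_v, control = list(tokenize = Fourwordsgram))
head(sort(slam::row_sums(Fourwords_blogs_v), decreasing = TRUE), 10)
Using slam::row_sums here avoids converting the full, unreduced term-document matrix to a dense matrix before summing.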
Thanks for reviewing! JPL