This is an exploration of the English blog, news and Twitter data for the Coursera Johns Hopkins Data Science Specialization capstone project. For sources, refer to the week 2 data sources in the specialization. I decided to leave the whole script exposed because I had some trouble making the quadgram (groups of four words) analysis work, and I hope to get feedback on that sooner or later. Although the quadgram plots do not appear at the end as shown results, all the work done should be sufficient to pass the weekly peer-graded assignment.
#Setting working directory and seed
setwd("C:/rtests")
set.seed(10000)
library(wordcloud)
## Loading required package: RColorBrewer
library(tm)
## Loading required package: NLP
library(slam)
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ngram)
#Read blogs, news and twitter txt files from working directory
p1 <- "./en_US.blogs.txt"
p2 <- "./en_US.news.txt"
p3 <- "./en_US.twitter.txt"
xtemp <- file(p1, open="rb"); us_blogs <- readLines(xtemp, encoding="UTF-8"); close(xtemp)
xtemp <- file(p2, open="rb"); us_news <- readLines(xtemp, encoding="UTF-8"); close(xtemp)
xtemp <- file(p3, open="rb"); us_twitter <- readLines(xtemp, encoding="UTF-8"); close(xtemp)
## Warning in readLines(xtemp, encoding = "UTF-8"): line 167155 appears to
## contain an embedded nul
## Warning in readLines(xtemp, encoding = "UTF-8"): line 268547 appears to
## contain an embedded nul
## Warning in readLines(xtemp, encoding = "UTF-8"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines(xtemp, encoding = "UTF-8"): line 1759032 appears to
## contain an embedded nul
rm(xtemp)
rm(p1)
rm(p2)
rm(p3)
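The embedded-nul warnings above come from the Twitter file and are harmless for this analysis. If you prefer readLines() to drop the nul characters silently, it accepts a skipNul argument; a minimal alternative read, shown here only for the Twitter file:
#Alternative read that skips embedded nul characters instead of warning about them
xtemp <- file("./en_US.twitter.txt", open="rb")
us_twitter <- readLines(xtemp, encoding="UTF-8", skipNul=TRUE)
close(xtemp); rm(xtemp)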
wordcloud(us_blogs, max.words = 100, random.order = FALSE,rot.per=0.35, use.r.layout=FALSE,colors=brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
wordcloud(us_news, max.words = 100, random.order = FALSE,rot.per=0.35, use.r.layout=FALSE,colors=brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
wordcloud(us_twitter, max.words = 100, random.order = FALSE,rot.per=0.35, use.r.layout=FALSE,colors=brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
The following data table contains, for each .txt file, the size of the text object in memory, the number of lines, the word count, and the love/hate ratio (the number of lines containing "love" divided by the number of lines containing "hate").
datatable <- data.frame ('File' = c("Blogs","News","Twitter"),
"FileSize" = sapply(list(us_blogs, us_news, us_twitter),
function(x){format(object.size(x),"MB")}),
'Lines' = sapply(list(us_blogs, us_news, us_twitter),
function(x){length(x)}),
'WordCount' = sapply(list(us_blogs, us_news, us_twitter),
function(x){wordcount(x)}),
'LoveHateRatio' = sapply(list(us_blogs, us_news, us_twitter),
function(x){(length(grep("love", x)))/(length(grep("hate", x)))})
)
datatable
## File FileSize Lines WordCount LoveHateRatio
## 1 Blogs 255.4 Mb 899288 37334131 4.430258
## 2 News 257.3 Mb 1010242 34372530 3.267445
## 3 Twitter 319 Mb 2360148 30373543 4.108592
It seems there is less love in the news than in other places.
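Note that object.size() reports the size of each character vector in memory rather than the size of the original .txt file on disk. If the on-disk size is wanted, base R's file.size() can be used instead; a small sketch, assuming the three files are still in the working directory:
#On-disk file sizes in MB (sketch; same paths as p1, p2 and p3 above)
sapply(c("./en_US.blogs.txt", "./en_US.news.txt", "./en_US.twitter.txt"),
       function(p){round(file.size(p)/1024^2, 1)})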
#Subsampling the data to 1%
us_blogs_sample <- sample(seq_len(length(us_blogs)),length(us_blogs)*0.01)
us_news_sample <- sample(seq_len(length(us_news)),length(us_news)*0.01)
us_twitter_sample <- sample(seq_len(length(us_twitter)),length(us_twitter)*0.01)
us_blogs_0.01 <- us_blogs[us_blogs_sample]
us_news_0.01 <- us_news[us_news_sample]
us_twitter_0.01 <- us_twitter[us_twitter_sample]
rm(us_blogs_sample)
rm(us_news_sample)
rm(us_twitter_sample)
rm(us_blogs)
rm(us_news)
rm(us_twitter)
rm(datatable)
us_blogs_c <- Corpus(VectorSource(c(us_blogs_0.01)), readerControl=list(reader=readPlain,language="en")) # Make corpus
us_news_c <- Corpus(VectorSource(c(us_news_0.01)), readerControl=list(reader=readPlain,language="en")) # Make corpus
us_twitter_c <- Corpus(VectorSource(c(us_twitter_0.01)), readerControl=list(reader=readPlain,language="en")) # Make corpus
#rm(us_blogs_0.01)
#rm(us_news_0.01)
#rm(us_twitter_0.01)
us_blogs_c <- Corpus(VectorSource(sapply(us_blogs_c, function(row) iconv(row, "latin1", "ASCII", sub="")))) # Remove non-ASCII
us_news_c <- Corpus(VectorSource(sapply(us_news_c, function(row) iconv(row, "latin1", "ASCII", sub="")))) # Remove non-ASCII
us_twitter_c <- Corpus(VectorSource(sapply(us_twitter_c, function(row) iconv(row, "latin1", "ASCII", sub="")))) # Remove non-ASCII
us_blogs_c <- tm_map(us_blogs_c, removePunctuation) # Remove punctuation
## Warning in tm_map.SimpleCorpus(us_blogs_c, removePunctuation):
## transformation drops documents
us_blogs_c <- tm_map(us_blogs_c, stripWhitespace) # Remove white spaces
## Warning in tm_map.SimpleCorpus(us_blogs_c, stripWhitespace): transformation
## drops documents
us_blogs_c <- tm_map(us_blogs_c, content_transformer(tolower)) # Convert to lowercase
## Warning in tm_map.SimpleCorpus(us_blogs_c, content_transformer(tolower)):
## transformation drops documents
us_blogs_c <- tm_map(us_blogs_c, removeNumbers) # Remove numbers
## Warning in tm_map.SimpleCorpus(us_blogs_c, removeNumbers): transformation
## drops documents
us_news_c <- tm_map(us_news_c, removePunctuation) # Remove punctuation
## Warning in tm_map.SimpleCorpus(us_news_c, removePunctuation):
## transformation drops documents
us_news_c <- tm_map(us_news_c, stripWhitespace) # Remove white spaces
## Warning in tm_map.SimpleCorpus(us_news_c, stripWhitespace): transformation
## drops documents
us_news_c <- tm_map(us_news_c, content_transformer(tolower)) # Convert to lowercase
## Warning in tm_map.SimpleCorpus(us_news_c, content_transformer(tolower)):
## transformation drops documents
us_news_c <- tm_map(us_news_c, removeNumbers) # Remove numbers
## Warning in tm_map.SimpleCorpus(us_news_c, removeNumbers): transformation
## drops documents
us_twitter_c <- tm_map(us_twitter_c, removePunctuation) # Remove punctuation
## Warning in tm_map.SimpleCorpus(us_twitter_c, removePunctuation):
## transformation drops documents
us_twitter_c <- tm_map(us_twitter_c, stripWhitespace) # Remove white spaces
## Warning in tm_map.SimpleCorpus(us_twitter_c, stripWhitespace):
## transformation drops documents
us_twitter_c <- tm_map(us_twitter_c, content_transformer(tolower)) # Convert to lowercase
## Warning in tm_map.SimpleCorpus(us_twitter_c, content_transformer(tolower)):
## transformation drops documents
us_twitter_c <- tm_map(us_twitter_c, removeNumbers) # Remove numbers
## Warning in tm_map.SimpleCorpus(us_twitter_c, removeNumbers): transformation
## drops documents
#us_twitter_c <- tm_map(us_twitter_c, function(x) iconv(enc2utf8(x), sub = "byte"))
# Tokenizer functions for the n-gram sizes (unigrams and quadgrams)
OneWordgram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
Fourwordsgram <- function(y) NGramTokenizer(y, Weka_control(min = 4, max = 4))
OneWord_blogs <- TermDocumentMatrix(us_blogs_c, control = list(tokenize = OneWordgram))
Fourwords_blogs <- TermDocumentMatrix(us_blogs_c, control = list(tokenize = Fourwordsgram))
OneWord_news <- TermDocumentMatrix(us_news_c, control = list(tokenize = OneWordgram))
Fourwords_news <- TermDocumentMatrix(us_news_c, control = list(tokenize = Fourwordsgram))
OneWord_twitter <- TermDocumentMatrix(us_twitter_c, control = list(tokenize = OneWordgram))
Fourwords_twitter <- TermDocumentMatrix(us_twitter_c, control = list(tokenize = Fourwordsgram))
# Sum of rows and sorting by n-gram frequency
freq_ngram <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  freq_ngram <- data.frame(word=names(freq), freq=freq)
  return(freq_ngram)
}
# Remove sparse terms to shrink the matrices, then sum and sort by frequency
OneWord_blogs2 <- removeSparseTerms(OneWord_blogs, 0.999)
OneWord_blogs3 <- freq_ngram(OneWord_blogs2)
Fourwords_blogs2 <- removeSparseTerms(Fourwords_blogs, 0.9999)
Fourwords_blogs3 <- freq_ngram(Fourwords_blogs2)
OneWord_news2 <- removeSparseTerms(OneWord_news, 0.999)
OneWord_news3 <- freq_ngram(OneWord_news2)
Fourwords_news2 <- removeSparseTerms(Fourwords_news, 0.9999)
Fourwords_news3 <- freq_ngram(Fourwords_news2)
OneWord_twitter2 <- removeSparseTerms(OneWord_twitter, 0.999)
OneWord_twitter3 <- freq_ngram(OneWord_twitter2)
Fourwords_twitter2 <- removeSparseTerms(Fourwords_twitter, 0.9999)
Fourwords_twitter3 <- freq_ngram(Fourwords_twitter2)
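The sparsity thresholds above (0.999 for unigrams, 0.9999 for quadgrams) determine how many terms survive. A quick sanity check is to compare the matrix dimensions before and after removing sparse terms; a small sketch for the blogs matrices only:
#How many terms remain after removeSparseTerms (blogs only)
dim(OneWord_blogs); dim(OneWord_blogs2)
dim(Fourwords_blogs); dim(Fourwords_blogs2)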
#Histogram top 25 most used words - BLOGS
OWB <- ggplot(data = OneWord_blogs3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
OWB <- OWB + labs(x = "Single word", y = "Frequency", title = "Frequencies of the 25 Most Abundant Unigrams (individual words) in BLOGS")
OWB <- OWB + theme(axis.text.x=element_text(angle=90))
OWB
#Histogram top 25 most used words - NEWS
OWN <- ggplot(data = OneWord_news3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
OWN <- OWN + labs(x = "Single word", y = "Frequency", title = "Frequencies of the 25 Most Abundant Unigrams (individual words) in NEWS")
OWN <- OWN + theme(axis.text.x=element_text(angle=90))
OWN
#Histogram top 25 most used words - TWITTER
OWT <- ggplot(data = OneWord_twitter3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
OWT <- OWT + labs(x = "Single word", y = "Frequency", title = "Frequencies of the 25 Most Abundant Unigrams (individual words) in TWITTER")
OWT <- OWT + theme(axis.text.x=element_text(angle=90))
OWT
For some reason I could not make the quadgram (groups of four words) plots work. Even after spending hours searching the web and the documentation I could not figure out why, so if anybody has any idea I will appreciate suggestions. I'll leave the code for the quadgram histograms below.
#Histogram of the 25 most used quadgrams (quartets of words) - BLOGS
FWB <- ggplot(data = Fourwords_blogs3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
FWB <- FWB + labs(x = "Quadgrams", y = "Frequency", title = "Frequencies of the 25 Most Abundant Quadgrams (quartets of words) in BLOGS")
FWB <- FWB + theme(axis.text.x=element_text(angle=90))
FWB
#Histogram of the 25 most used quadgrams (quartets of words) - NEWS
FWN <- ggplot(data = Fourwords_news3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
FWN <- FWN + labs(x = "Quadgrams", y = "Frequency", title = "Frequencies of the 25 Most Abundant Quadgrams (quartets of words) in NEWS")
FWN <- FWN + theme(axis.text.x=element_text(angle=90))
FWN
#Histogram of the 25 most used quadgrams (quartets of words) - TWITTER
FWT <- ggplot(data = Fourwords_twitter3[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
FWT <- FWT + labs(x = "Quadgrams", y = "Frequency", title = "Frequencies of the 25 Most Abundant Quadgrams (quartets of words) in TWITTER")
FWT <- FWT + theme(axis.text.x=element_text(angle=90))
FWT
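One possible cause of the quadgram problem, offered only as a guess: Corpus() on a character vector returns a SimpleCorpus, and TermDocumentMatrix() on a SimpleCorpus does not support custom tokenize functions, so the "quadgram" matrices may actually contain single words rather than four-word phrases. If that is the cause, building the corpora with VCorpus() should let the RWeka tokenizer take effect. A sketch for the blogs data only (the _v objects are new names introduced just for this sketch):
#Sketch: rebuild the blogs corpus as a VCorpus so the custom tokenizer is used
us_blogs_v <- VCorpus(VectorSource(us_blogs_0.01))
us_blogs_v <- tm_map(us_blogs_v, removePunctuation)
us_blogs_v <- tm_map(us_blogs_v, stripWhitespace)
us_blogs_v <- tm_map(us_blogs_v, content_transformer(tolower))
us_blogs_v <- tm_map(us_blogs_v, removeNumbers)
Fourwords_blogs_v <- TermDocumentMatrix(us_blogs_v, control = list(tokenize = Fourwordsgram))
head(sort(slam::row_sums(Fourwords_blogs_v), decreasing = TRUE), 10)
Using slam::row_sums here avoids converting the full, unreduced term-document matrix to a dense matrix before summing.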
Thanks for reviewing! JPL