This report was produced as part of the peer-graded assignment for the Data Science Capstone course on Coursera. It details the exploratory data analysis conducted on the dataset provided in the course.
# Load packages
library(stringr)
library(RWeka)
library(tm)
## Loading required package: NLP
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(plyr)
The dataset contains text extracted from blogs, news sites and Twitter. It is loaded into the workspace:
blogs <- readLines("final/en_US/en_US.blogs.txt",encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt",encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
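Note: readLines() is known to stop early on en_US.news.txt because of an embedded control character in that file. If the news line count looks suspiciously low, a commonly suggested workaround (a sketch, not used for the results below) is to read through a binary connection:
con <- file("final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)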
The first three lines of each source:
head(blogs, 3)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
head(news, 3)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
head(twitter, 3)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
A data frame (full.df) containing the number of lines and the word count for each source is created, and the bar charts below visualize the results.
# Create data frame to store the data
full.df <- data.frame(source = c("blogs", "news", "twitter"))
# Place sources into a list
list.source <- list(blogs = blogs, news = news, twitter = twitter)
# Count number of lines and words
full.df$line.count <- sapply(list.source, length)
full.df$word.count <- sapply(list.source, function(x) {sum(str_count(x, "\\S+"))})
ggplot(full.df, aes(x = as.factor(source), y = line.count, fill = as.factor(source))) +
geom_bar(stat = "identity") +
geom_text(aes(label = line.count), vjust=1.0)+
labs(x = "Source", y = "Number of lines",
title = "Lines per source", fill = "Source")
ggplot(full.df, aes(x = as.factor(source), y = word.count, fill = as.factor(source))) +
geom_bar(stat = "identity") +
geom_text(aes(label = word.count), vjust=1.0)+
labs(x = "Source", y = "Number of words",
title = "Words per source", fill = "Source")
Figure 1: Text from the news source is the smallest in both line count and word count.
Since it is not necessary to analyze every line, random sampling is applied to select a subset of text from each source. The number of lines and words in this subset is displayed in the bar charts below.
# Select 5% from each source
n <- round(full.df$line.count * 0.05)
set.seed(123) # for reproducible results
blogs.sample <- blogs[floor(runif(n[1], 1, length(blogs)))]
news.sample <- news[floor(runif(n[2], 1, length(news)))]
twitter.sample <- twitter[floor(runif(n[3], 1, length(twitter)))]
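Indexing with floor(runif(...)) can select the same line more than once. A sketch of an alternative that draws the same number of lines without duplicates, using sample() (not the approach used for the figures below):
set.seed(123)
blogs.sample <- blogs[sample(length(blogs), n[1])]
news.sample <- news[sample(length(news), n[2])]
twitter.sample <- twitter[sample(length(twitter), n[3])]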
# Create a data frame to store data
sample.df <- data.frame(source = c("blogs", "news", "twitter")) # labels ordered to match sample.list below
sample.list <- list(blogs.sample, news.sample, twitter.sample)
# get counts of sample.list
sample.df$line.count <- sapply(sample.list, length)
sample.df$word.count <- sapply(sample.list, function(x) {sum(str_count(x, "\\S+"))})
ggplot(sample.df, aes(x = as.factor(source), y = line.count, fill = as.factor(source))) +
geom_bar(stat = "identity") +
geom_text(aes(label = line.count), vjust=1.0)+
labs(x = "Source", y = "Number of lines",
title = "Lines per source", fill = "Source")
ggplot(sample.df, aes(x = as.factor(source), y = word.count, fill = as.factor(source))) +
geom_bar(stat = "identity") +
geom_text(aes(label = word.count), vjust=1.0)+
labs(x = "Source", y = "Number of words",
title = "Words per source", fill = "Source")
Figure 2: Because each sample is a proportional 5% draw, the ordering mirrors Figure 1, with the news source again the smallest in both cases.
The texts are compiled into one corpus. They are then pre-processed before proceeding to the analysis (removing punctuation and numbers, converting words to lowercase, etc.). Offensive and inappropriate words are removed using a list of words downloaded from Carnegie Mellon University’s School of Computer Science. URLs and Twitter hashtags and handles are also removed from the corpus, as they have no bearing on the analysis.
# Create corpus
doc <- Corpus(VectorSource(list(c(blogs.sample, news.sample, twitter.sample))))
# Pre-processing
doc <- tm_map(doc, content_transformer(tolower))
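# The text above mentions removing URLs and Twitter hashtags/handles, which the
# steps shown here do not cover explicitly. A sketch of how this could be done
# with content_transformer() and gsub() (the patterns are illustrative
# assumptions), applied before removePunctuation() so the symbols still exist:
doc <- tm_map(doc, content_transformer(function(x) gsub("http[s]?://\\S+|www\\.\\S+", " ", x)))
doc <- tm_map(doc, content_transformer(function(x) gsub("[@#]\\S+", " ", x)))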
doc <- tm_map(doc, removePunctuation)
doc <- tm_map(doc, removeNumbers)
doc <- tm_map(doc, removeWords, stopwords("english"))
# Remove bad words
googlebadwords <- read.delim("final/bad-words.txt",sep = ":",header = FALSE)
googlebadwords <- googlebadwords[,1]
doc <- tm_map(doc, removeWords, googlebadwords)
# Strip/Remove excess whitespace
doc <- tm_map(doc, stripWhitespace)
# Save the corpus as an external file
writeCorpus(doc, filenames = "my_corpus.txt")
# Re-read the file into the workspace
my.corpus <- readLines("my_corpus.txt")
The words are tokenized into unigrams, bigrams and trigrams. The n-gram tokenizer function, ngram_tokenizer, was created by Maciej Szymkiewicz and is publicly available.
The tokenized words are put into a data frame along with their frequencies. In the following subsections, the top words from each analysis are displayed.
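Ngrams_Tokenizer.R is not reproduced in this report. As a rough idea of the shape of such a function, a minimal sketch of an n-gram tokenizer factory could look like the following (an illustrative assumption, not Szymkiewicz’s implementation); the analysis below sources the original script rather than this sketch:
ngram_tokenizer <- function(n) {
  function(x) {
    # split on whitespace and drop empty tokens
    words <- unlist(strsplit(x, "\\s+"))
    words <- words[words != ""]
    if (length(words) < n) return(character(0))
    # paste every run of n consecutive words into one token
    vapply(seq_len(length(words) - n + 1),
           function(i) paste(words[i:(i + n - 1)], collapse = " "),
           character(1))
  }
}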
source("Ngrams_Tokenizer.R")
unigram.tokenizer <- ngram_tokenizer(1)
wordlist <- unigram.tokenizer(my.corpus)
unigram.df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(unigram.df) <- c("word","freq")
unigram.df <- unigram.df[order(-unigram.df$freq), ]
row.names(unigram.df) <- NULL
ggplot(head(unigram.df,15), aes(x=reorder(word,-freq), y=freq, fill=word)) +
geom_bar(stat="Identity") +
geom_text(aes(label=freq), vjust = -0.5) + ylim(0, 14000) +
labs(x = "Term", y = "Frequency", title = "Unigrams Frequency") +
theme(legend.position="none")
bigram.tokenizer <- ngram_tokenizer(2)
wordlist <- bigram.tokenizer(my.corpus)
bigram.df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(bigram.df) <- c("word","freq")
bigram.df <- bigram.df[order(-bigram.df$freq), ]
row.names(bigram.df) <- NULL
ggplot(head(bigram.df,15), aes(x=reorder(word,-freq), y=freq, fill=word)) +
geom_bar(stat="Identity") +
geom_text(aes(label=freq), hjust = -0.5) + ylim(0,1400) +
labs(x = "Term", y = "Frequency", title = "Bigrams Frequency") +
theme(legend.position="none") + coord_flip()
trigram.tokenizer <- ngram_tokenizer(3)
wordlist <- trigram.tokenizer(my.corpus)
trigram.df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(trigram.df) <- c("word","freq")
trigram.df <- trigram.df[order(-trigram.df$freq), ]
row.names(trigram.df) <- NULL
ggplot(head(trigram.df,15), aes(x=reorder(word,-freq), y=freq, fill=word)) +
geom_bar(stat="Identity") +
geom_text(aes(label=freq), hjust = -0.5) + ylim(0,200) +
labs(x = "Term", y = "Frequency", title = "Trigrams Frequency") +
theme(legend.position="none") + coord_flip()
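As an aside, the RWeka package loaded at the top of this report is not actually used above. Its NGramTokenizer() could serve as an alternative tokenizer; a sketch (not used for the figures above):
bigram.weka <- NGramTokenizer(paste(my.corpus, collapse = " "),
                              Weka_control(min = 2, max = 2))
head(sort(table(bigram.weka), decreasing = TRUE), 15)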