The first step in building a predictive model for text is understanding the distribution of, and the relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships you observe in the data and prepare to build your first linguistic models.
Tasks to accomplish:
1. Exploratory analysis - perform a thorough exploratory analysis of the data to understand the distribution of words and the relationships between the words in the corpora.
2. Understand frequencies of words and word pairs - build figures and tables to understand the variation in the frequencies of words and word pairs in the data.
# Locations of the three en_US corpus files (blogs, news, twitter).
path1 <- "C:/Users/junio/Desktop/COURSERA/DATA SCIENCE/COURSE 10 - Data Science Capstone/WEEK 1/final/en_US/en_US.blogs.txt"
path2 <- "C:/Users/junio/Desktop/COURSERA/DATA SCIENCE/COURSE 10 - Data Science Capstone/WEEK 1/final/en_US/en_US.news.txt"
path3 <- "C:/Users/junio/Desktop/COURSERA/DATA SCIENCE/COURSE 10 - Data Science Capstone/WEEK 1/final/en_US/en_US.twitter.txt"

# Read all lines of one corpus file.
#
# path: path to a text file.
# Returns a character vector with one element per line, marked as UTF-8,
# with embedded NUL bytes skipped.
read_corpus_lines <- function(path) {
  # The file is opened in binary mode ("rb").
  # NOTE(review): presumably to stop text-mode reads from ending early on
  # embedded control characters on Windows - confirm.
  con <- file(path, open = "rb")
  # Release the connection even when readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus_lines(path1)
news <- read_corpus_lines(path2)
twitter <- read_corpus_lines(path3)
library(RColorBrewer)
library(wordcloud)
library(slam)
library(ggplot2)
library(gridExtra)
# Draw one word cloud per source, side by side on a single row.
#
# x: data frame with the columns `word`, `freq` and `source`.
#
# One panel is drawn per distinct value of `x$source`, in sorted (factor
# level) order. Panel titles are taken positionally from `Headings`, so the
# sorted sources are expected to be blogs, news, twitter. At most 20 words
# are drawn per cloud.
#
# Called for its plotting side effect; returns NULL invisibly.
wordcloud.print <- function(x) {
  # One row, three columns; put the caller's layout back when we are done.
  old_par <- par(mfrow = c(1, 3))
  on.exit(par(old_par), add = TRUE)

  Headings <- c("Word Cloud - US English Blogs", "Word Cloud - US English News", "Word Cloud - US English Twitter")

  # Loop invariants: the distinct sources and the colour palette.
  sources <- levels(as.factor(x$source))
  cloud_colors <- brewer.pal(8, "Dark2")

  for (i in seq_along(sources)) {
    in_source <- x$source == sources[i]
    wordcloud(
      words = x[in_source, "word"],
      freq = x[in_source, "freq"],
      scale = c(1, .1),
      max.words = 20,
      random.order = FALSE,
      rot.per = 0.45,
      use.r.layout = FALSE,
      colors = cloud_colors
    )
    title(Headings[i])
  }

  invisible(NULL)
}
# Show the n-gram frequencies of the three sources as side-by-side bar charts.
#
# x1, x2, x3: data frames with the columns `word` and `freq` for blogs, news
#             and twitter respectively.
# n:          index into c("Unigrams", "Bigrams", "Trigrams") used as the
#             x-axis label.
#
# Returns the value of grid.arrange().
Ngrams.plot <- function(x1, x2, x3, n) {
  axis_label <- c("Unigrams", "Bigrams", "Trigrams")[n]

  # One bar chart: bars ordered by descending frequency, labels rotated 90
  # degrees.
  bar_chart <- function(counts, heading, bar_fill) {
    ggplot(counts, aes(reorder(word, -freq), freq)) +
      geom_bar(stat = "identity", fill = bar_fill) +
      ggtitle(heading) +
      xlab(axis_label) +
      ylab("Frequency") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  }

  grid.arrange(
    bar_chart(x1, "Blogs", "blue"),
    bar_chart(x2, "News", "red"),
    bar_chart(x3, "Twitter", "green"),
    ncol = 3
  )
}
# Total each column of a document-by-term table and rank the terms.
#
# x: data frame (or matrix with column names) with one column per term and
#    numeric counts in the cells.
#
# Returns a data frame with the columns `word` (the column names of `x`) and
# `freq` (the column sums), sorted by decreasing `freq`. Ties keep the column
# order of `x`.
freq_frame <- function(x) {
  totals <- colSums(x)
  d <- data.frame(word = colnames(x), freq = totals)
  # Base-R ordering: no dplyr / magrittr needed just to sort one column.
  d[order(-d$freq), , drop = FALSE]
}
# Total each row of a term-by-document matrix and rank the terms.
#
# tdm: a term-document matrix with terms as row names. This can be a sparse
#      `simple_triplet_matrix` (which tm's TermDocumentMatrix inherits from)
#      or anything as.matrix() can convert.
#
# Returns a data frame with the columns `word` and `freq`, sorted by
# decreasing `freq`.
freq_frame_tok <- function(tdm) {
  totals <- if (inherits(tdm, "simple_triplet_matrix")) {
    # Sum on the sparse representation; as.matrix() would allocate a dense
    # terms x documents matrix first.
    slam::row_sums(tdm)
  } else {
    rowSums(as.matrix(tdm))
  }
  totals <- sort(totals, decreasing = TRUE)
  data.frame(word = names(totals), freq = totals)
}
# Summary table with one row per corpus: file size in MiB, number of lines,
# word count (the "Words" element of stringi::stri_stats_latex) and total
# number of characters.
statistics <- local({
  # Kept local so no extra reference to the full corpora outlives this table.
  corpora <- list(blogs, news, twitter)
  data.frame(
    "File" = c("Blogs", "News", "Twitter"),
    "File.Size" = file.info(c(path1, path2, path3))$size / 1024^2,
    "Num.Entries" = lengths(corpora),
    # Select the word count by name rather than by its position in the result.
    "Words.Count" = vapply(
      corpora,
      function(txt) stringi::stri_stats_latex(txt)[["Words"]],
      integer(1)
    ),
    "Total.Characteres" = vapply(
      corpora,
      function(txt) sum(nchar(txt)),
      integer(1)
    )
  )
})
knitr::kable(statistics)
# Word cloud plus top-10 bar charts for unigrams, bigrams and trigrams.
# head() is used rather than [1:10, ] so that a table with fewer than ten
# rows is plotted as is, without all-NA padding rows.
wordcloud.print(dfunigrams)
Ngrams.plot(head(df1.blogs, 10), head(df1.news, 10), head(df1.twitter, 10), 1)
wordcloud.print(dfbigrams)
Ngrams.plot(head(df2.blogs, 10), head(df2.news, 10), head(df2.twitter, 10), 2)
wordcloud.print(dftrigrams)
Ngrams.plot(head(df3.blogs, 10), head(df3.news, 10), head(df3.twitter, 10), 3)
# Fixed seed so the same lines are drawn on every run.
set.seed(3456)

# Draw 3% of the lines of each corpus, without replacement.
sampleBlogs <- sample(blogs, size = 0.03 * length(blogs), replace = FALSE)
sampleNews <- sample(news, size = 0.03 * length(news), replace = FALSE)
sampleTwitter <- sample(twitter, size = 0.03 * length(twitter), replace = FALSE)

# Convert UTF-8 to ASCII; with sub = "byte", bytes that cannot be converted
# are replaced by their hex code in angle brackets.
sdata <- lapply(
  list(sampleBlogs, sampleNews, sampleTwitter),
  iconv,
  from = "UTF-8", to = "ASCII", sub = "byte"
)
sblogs <- sdata[[1]]
snews <- sdata[[2]]
stwitter <- sdata[[3]]

# Drop the full corpora and the unconverted samples.
rm(blogs, news, twitter, sampleBlogs, sampleNews, sampleTwitter)
# Build one cleaned tm corpus per sampled source, in the order of `sdata`.
# Cleaning steps: lower-case, remove punctuation, collapse whitespace.
vcorpus <- lapply(sdata, function(lines) {
  corpus <- tm::VCorpus(
    tm::VectorSource(lines),
    readerControl = list(reader = tm::readPlain, language = "en")
  )
  # tolower() is a plain character function; content_transformer() applies it
  # to the document content and keeps each element a text document, so no
  # PlainTextDocument() repair step is needed afterwards.
  corpus <- tm::tm_map(corpus, tm::content_transformer(tolower))
  corpus <- tm::tm_map(corpus, tm::removePunctuation)
  tm::tm_map(corpus, tm::stripWhitespace)
})
# Unigram frequencies: per source, remove English stop words, stem, build a
# document-term matrix, drop sparse terms, then total the term counts.
v1corpus <- vcorpus
frequencies <- vector("list", length(v1corpus))
sparse <- vector("list", length(v1corpus))
df <- vector("list", length(v1corpus))

for (i in seq_along(v1corpus)) {
  # NOTE(review): vcorpus has already had punctuation removed, so stop words
  # that contain an apostrophe presumably no longer match here - confirm.
  v1corpus[[i]] <- tm::tm_map(v1corpus[[i]], tm::removeWords, tm::stopwords("english"))
  v1corpus[[i]] <- tm::tm_map(v1corpus[[i]], tm::stemDocument)
  frequencies[[i]] <- tm::DocumentTermMatrix(v1corpus[[i]])
  # Drop terms whose sparsity exceeds the 0.99 threshold.
  sparse[[i]] <- tm::removeSparseTerms(frequencies[[i]], 0.99)
  # Column names stay the terms themselves. make.names() would rewrite some
  # of them (e.g. reserved words, tokens starting with a digit), and those
  # rewritten strings would then be reported as the words.
  df[[i]] <- as.data.frame(as.matrix(sparse[[i]]))
}

df1.blogs <- freq_frame(df[[1]])
df1.blogs$source <- "blogs"
df1.news <- freq_frame(df[[2]])
df1.news$source <- "news"
df1.twitter <- freq_frame(df[[3]])
df1.twitter$source <- "twitter"

# Top 20 per source; head() avoids all-NA rows when fewer than 20 terms remain.
dfunigrams <- rbind(head(df1.blogs, 20), head(df1.news, 20), head(df1.twitter, 20))
rm(v1corpus, frequencies, sparse, df)
# Bigram frequencies: per source, remove English stop words, tokenize into
# two-word sequences, drop sparse bigrams, then total the counts.
# No stemming is applied here.
v2corpus <- vcorpus

# Tokenizer producing exactly two-word n-grams. Both RWeka functions are
# namespace-qualified so the tokenizer works whether or not RWeka is attached.
BigramTokenizer <- function(x) {
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))
}

frequencies <- vector("list", length(v2corpus))
sparse <- vector("list", length(v2corpus))

for (i in seq_along(v2corpus)) {
  v2corpus[[i]] <- tm::tm_map(v2corpus[[i]], tm::removeWords, tm::stopwords("english"))
  frequencies[[i]] <- tm::TermDocumentMatrix(v2corpus[[i]], control = list(tokenize = BigramTokenizer))
  # Drop bigrams whose sparsity exceeds the 0.999 threshold.
  sparse[[i]] <- tm::removeSparseTerms(frequencies[[i]], 0.999)
}

df2.blogs <- freq_frame_tok(sparse[[1]])
df2.blogs$source <- "blogs"
df2.news <- freq_frame_tok(sparse[[2]])
df2.news$source <- "news"
df2.twitter <- freq_frame_tok(sparse[[3]])
df2.twitter$source <- "twitter"

# Top 20 per source; head() avoids all-NA rows when fewer than 20 bigrams remain.
dfbigrams <- rbind(head(df2.blogs, 20), head(df2.news, 20), head(df2.twitter, 20))
rm(v2corpus, frequencies, sparse)
# Trigram frequencies: per source, remove English stop words, tokenize into
# three-word sequences, drop sparse trigrams, then total the counts.
v3corpus <- vcorpus

# Tokenizer producing exactly three-word n-grams. Both RWeka functions are
# namespace-qualified so the tokenizer works whether or not RWeka is attached.
TrigramTokenizer <- function(x) {
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 3, max = 3))
}

frequencies <- vector("list", length(v3corpus))
sparse <- vector("list", length(v3corpus))

for (i in seq_along(v3corpus)) {
  v3corpus[[i]] <- tm::tm_map(v3corpus[[i]], tm::removeWords, tm::stopwords("english"))
  frequencies[[i]] <- tm::TermDocumentMatrix(v3corpus[[i]], control = list(tokenize = TrigramTokenizer))
  # Drop trigrams whose sparsity exceeds the 0.9999 threshold.
  sparse[[i]] <- tm::removeSparseTerms(frequencies[[i]], 0.9999)
}

df3.blogs <- freq_frame_tok(sparse[[1]])
df3.blogs$source <- "blogs"
df3.news <- freq_frame_tok(sparse[[2]])
df3.news$source <- "news"
df3.twitter <- freq_frame_tok(sparse[[3]])
df3.twitter$source <- "twitter"

# Top 20 per source; head() avoids all-NA rows when fewer than 20 trigrams remain.
dftrigrams <- rbind(head(df3.blogs, 20), head(df3.news, 20), head(df3.twitter, 20))
# The shared cleaned corpora are no longer needed after this step.
rm(v3corpus, frequencies, sparse, vcorpus)