This is the milestone report for the Data Science Capstone project from the Coursera Data Science Specialization. The goal of this report is to load the text data sets, summarize them, and explore the frequency distribution of words. The end goal is to create a text-prediction application with R's Shiny package that predicts the next word using a natural language processing model.
The first step is to get an idea of what kinds of pre-processing will be necessary to prepare the data for building the model. Specifically, certain kinds of characters and words need to be removed or modified to improve prediction accuracy. Finally, I need to create lists of single words and of two- and three-word phrases to see which occur most frequently; a sketch of that n-gram step appears at the end of this report.
library(stringi)
library(dplyr)
library(knitr)
library(tm)
library(wordcloud)
library(RColorBrewer)
fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")){
download.file(fileUrl, destfile = "Coursera-SwiftKey.zip")
}
unzip("Coursera-SwiftKey.zip")
Build a summary table of the provided data showing the file size, number of lines, and number of words in each category (blogs, news, Twitter).
file.list <- c("final/en_US/en_US.blogs.txt",
               "final/en_US/en_US.news.txt",
               "final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
data.summary <- matrix(0, nrow = 3, ncol = 3,
                       dimnames = list(c("blogs", "news", "twitter"),
                                       c("file size, Mb", "lines", "words")))
for (i in 1:3) {
    # Open in binary mode and skip embedded nulls so readLines gets every line
    con <- file(file.list[i], "rb")
    text[[i]] <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
    close(con)
    data.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)  # size in MB
    data.summary[i, 2] <- length(text[[i]])                                # line count
    data.summary[i, 3] <- sum(stri_count_words(text[[i]]))                 # word count
}
kable(data.summary)
|         | file size, Mb | lines   | words    |
|---------|---------------|---------|----------|
| blogs   | 200.42        | 899288  | 37546239 |
| news    | 196.28        | 1010242 | 34762395 |
| twitter | 159.36        | 2360148 | 30093413 |
Because the full data sets are very large, we take a small random sample of lines (0.5%) from each source to keep the exploration fast.
# Fix the seed so the random sample is reproducible
set.seed(123)
blogs_sample <- sample(text$blogs, 0.005 * length(text$blogs))
news_sample <- sample(text$news, 0.005 * length(text$news))
twitter_sample <- sample(text$twitter, 0.005 * length(text$twitter))
Clean the sampled data with the tm package using the following rules:

- convert all text to lower case
- remove punctuation
- remove numbers
- remove English stop words
- strip extra whitespace
Clean the blogs data
# Create corpus for blog corpus1
corpus1 <- Corpus(VectorSource(blogs_sample))
corpus1 <- tm_map(corpus1, content_transformer(tolower))
corpus1 <- tm_map(corpus1, removePunctuation)
corpus1 <- tm_map(corpus1, removeNumbers)
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
corpus1 <- tm_map(corpus1, stripWhitespace)
Clean the news data
# Create corpus for news corpus2
corpus2 <- Corpus(VectorSource(news_sample))
corpus2 <- tm_map(corpus2, content_transformer(tolower))
corpus2 <- tm_map(corpus2, removePunctuation)
corpus2 <- tm_map(corpus2, removeNumbers)
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
corpus2 <- tm_map(corpus2, stripWhitespace)
Clean the Twitter data
# Create corpus for Twitter corpus3
corpus3 <- Corpus(VectorSource(twitter_sample))
## Convert the character vector to valid UTF-8 first; tweets often contain
## emoji and other non-ASCII bytes that would break the later transformations
corpus3 <- tm_map(corpus3, content_transformer(function(x)
    iconv(x, to = "UTF-8", sub = "byte")))
corpus3 <- tm_map(corpus3, content_transformer(tolower))
corpus3 <- tm_map(corpus3, removePunctuation)
corpus3 <- tm_map(corpus3, removeNumbers)
corpus3 <- tm_map(corpus3, removeWords, stopwords("english"))
corpus3 <- tm_map(corpus3, stripWhitespace)
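The three cleaning blocks above apply the same transformations. As a minimal sketch of how that repetition could be factored out (clean_corpus is a hypothetical helper name, not part of the code above), the same tm steps can be wrapped in one function and applied to each sample:
# Hypothetical helper that bundles the cleaning steps used above
clean_corpus <- function(text_sample) {
    corpus <- Corpus(VectorSource(text_sample))
    # Force valid UTF-8 first so the later transformations do not fail
    corpus <- tm_map(corpus, content_transformer(function(x)
        iconv(x, to = "UTF-8", sub = "byte")))
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, removeNumbers)
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
    tm_map(corpus, stripWhitespace)
}
# Equivalent to corpus1, corpus2, and corpus3 above
corpus1 <- clean_corpus(blogs_sample)
corpus2 <- clean_corpus(news_sample)
corpus3 <- clean_corpus(twitter_sample)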
We use bar plots and word clouds to identify the most frequently used words in each sample.
Blog - barplot
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus1))),decreasing=TRUE), 10)
barplot(frequentWords,
        main = "Blogs Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
Blog - WordCloud
term.doc.matrix1 <- TermDocumentMatrix(corpus1)
term.doc.matrix1 <- as.matrix(term.doc.matrix1)
word.freqs1 <- sort(rowSums(term.doc.matrix1), decreasing=TRUE)
dm1 <- data.frame(word=names(word.freqs1), freq=word.freqs1)
wordcloud(dm1$word, dm1$freq, min.freq= 100, random.order=TRUE, rot.per=.25, colors=brewer.pal(8, "Dark2"))
News - barplot
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus2))),decreasing=TRUE), 10)
barplot(frequentWords,
        main = "News Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
News - WordCloud
term.doc.matrix2 <- TermDocumentMatrix(corpus2)
term.doc.matrix2 <- as.matrix(term.doc.matrix2)
word.freqs2 <- sort(rowSums(term.doc.matrix2), decreasing=TRUE)
dm2 <- data.frame(word=names(word.freqs2), freq=word.freqs2)
wordcloud(dm2$word, dm2$freq, min.freq= 100, random.order=TRUE, rot.per=.25, colors=brewer.pal(8, "Dark2"))
Twitter - barplot
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus3))),decreasing=TRUE), 10)
barplot(frequentWords,
        main = "Twitter Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
Twitter - WordCloud
term.doc.matrix3 <- TermDocumentMatrix(corpus3)
term.doc.matrix3 <- as.matrix(term.doc.matrix3)
word.freqs3 <- sort(rowSums(term.doc.matrix3), decreasing=TRUE)
dm3 <- data.frame(word=names(word.freqs3), freq=word.freqs3)
wordcloud(dm3$word, dm3$freq, min.freq= 100, random.order=FALSE, rot.per=.25, colors=brewer.pal(8, "Dark2"))
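Finally, as a first step toward the n-gram model mentioned in the introduction, here is a minimal sketch of counting two- and three-word phrase frequencies. The helper ngram_freq is my own hypothetical function, not part of the pipeline above; it works on the raw sampled lines, splits on non-letter characters, and ignores sentence boundaries, so the full model would first apply the same cleaning as above and tokenize line by line.
# Hypothetical helper: count n-gram frequencies in a character vector of lines
ngram_freq <- function(lines, n = 2) {
    # Lower-case and split into words on anything that is not a letter or apostrophe
    words <- unlist(strsplit(tolower(lines), "[^a-z']+"))
    words <- words[words != ""]
    # Build n-grams by pasting n shifted copies of the word vector together
    shifts <- lapply(seq_len(n), function(k) words[k:(length(words) - n + k)])
    grams <- do.call(paste, shifts)
    sort(table(grams), decreasing = TRUE)
}
head(ngram_freq(blogs_sample, n = 2), 10)  # most frequent two-word phrases
head(ngram_freq(blogs_sample, n = 3), 10)  # most frequent three-word phrases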