This report provides a short overview of the exploratory analysis of the text data to be used for the Capstone project for the Data Science Specialization. It highlights the major features of the data, explores summary statistics, and outlines plans for creating a word prediction algorithm and Shiny app.
The datasets used are en_US.blogs.txt,
en_US.news.txt, and en_US.twitter.txt. Below
is a summary of the basic statistics for each dataset, including file
size, line count, and word count.
# Load required libraries
library(knitr)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(stringi)
# File paths
# Directory that holds the three en_US corpus files. Kept in one place so the
# report can be pointed at another copy of the data with a single edit.
data_dir <- "C:/Users/aaron/Downloads/Coursera-SwiftKey/final/en_US"
# Full paths, in the order blogs, news, twitter.
file.list <- file.path(
  data_dir,
  c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
)
# Initialize variables
# One label per entry of `file.list`, in the same order. The list of texts and
# the summary matrix are both sized from this vector so they cannot drift apart.
dataset_names <- c("blogs", "news", "twitter")
stopifnot(length(file.list) == length(dataset_names))

# Read every line of one file through a binary-mode connection.
# The connection is closed by on.exit(), so it is released even when
# readLines() fails part-way through.
# NOTE(review): "rb" plus skipNul = TRUE presumably guards against embedded
# control/NUL bytes in the raw corpus files - confirm against the data.
read_corpus_lines <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

text <- setNames(vector("list", length(dataset_names)), dataset_names)
matrix.summary <- matrix(
  0,
  nrow = length(dataset_names), ncol = 3,
  dimnames = list(dataset_names, c("File Size (MB)", "Lines", "Words"))
)
# Read data and populate summary matrix
for (i in seq_along(file.list)) {
  text[[i]] <- read_corpus_lines(file.list[i])
  # Column 1: size on disk in MB, column 2: number of lines read,
  # column 3: total word count over all lines (stringi word boundaries).
  matrix.summary[i, 1] <- round(file.size(file.list[i]) / 1024^2, 2)
  matrix.summary[i, 2] <- length(text[[i]])
  matrix.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}
# Display summary table
kable(matrix.summary)
| | File Size (MB) | Lines | Words |
|---|---|---|---|
| blogs | 200.42 | 899288 | 37546806 |
| news | 196.28 | 1010242 | 34762658 |
| twitter | 159.36 | 2360148 | 30096690 |
# Fix the RNG state so the random subsets drawn below are reproducible.
set.seed(123)
# Sampling
# Share of lines kept from each source (0.5%).
sample_fraction <- 0.005
# Draw a random subset of `lines` whose size is that share of its length.
draw_lines <- function(lines) {
  sample(lines, sample_fraction * length(lines))
}
# The draws happen in the order blogs, news, twitter.
blogs_sample <- draw_lines(text$blogs)
news_sample <- draw_lines(text$news)
twitter_sample <- draw_lines(text$twitter)
# Preprocessing function
#
# Build a tm corpus from a character vector and normalise it for word counts.
#
# Args:
#   sample: character vector of raw text, one document per element.
#
# Returns:
#   The corpus after lower-casing, stop-word removal, punctuation removal,
#   number removal and whitespace collapsing, applied in that order.
preprocess_text <- function(sample) {
  corpus <- Corpus(VectorSource(sample))
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Stop words are removed while apostrophes are still present: the English
  # stop-word list contains contractions such as "don't" and "i'm", which no
  # longer match once punctuation has been stripped ("dont", "im").
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  # Collapse the gaps left behind by the removals above.
  tm_map(corpus, stripWhitespace)
}
# Apply preprocessing
# Each sample is cleaned with preprocess_text(), giving one corpus per source.
# The `##` lines after each call are the warnings that were recorded when this
# report was rendered; they come from the tm_map() steps inside
# preprocess_text().
# NOTE(review): "transformation drops documents" is raised by
# tm_map.SimpleCorpus - confirm that no documents are actually lost here.
corpus_blogs <- preprocess_text(blogs_sample)
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
corpus_news <- preprocess_text(news_sample)
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
corpus_twitter <- preprocess_text(twitter_sample)
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
# Blogs analysis
# Term-by-document counts for the cleaned blog sample.
term.doc.matrix_blogs <- TermDocumentMatrix(corpus_blogs)
# Total occurrences of every term across all documents, most frequent first.
# NOTE(review): as.matrix() builds the full dense matrix; watch memory use if
# the sample size is increased.
freq_blogs <- sort(
  rowSums(as.matrix(term.doc.matrix_blogs)),
  decreasing = TRUE
)
# The ten most frequent terms.
top_blogs <- head(freq_blogs, n = 10)
# Barplot
barplot(
  height = top_blogs,
  main = "Blogs Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count",
  col = "skyblue"
)
# Word cloud
# Only terms seen at least 100 times are drawn; the most frequent terms are
# placed first and a quarter of the words are rotated.
wordcloud(
  words = names(freq_blogs),
  freq = freq_blogs,
  min.freq = 100,
  random.order = FALSE,
  rot.per = 0.25,
  colors = brewer.pal(n = 8, name = "Dark2")
)
# News analysis
# Term-by-document counts for the cleaned news sample.
term.doc.matrix_news <- TermDocumentMatrix(corpus_news)
# Total occurrences of every term across all documents, most frequent first.
# NOTE(review): as.matrix() builds the full dense matrix; watch memory use if
# the sample size is increased.
freq_news <- sort(
  rowSums(as.matrix(term.doc.matrix_news)),
  decreasing = TRUE
)
# The ten most frequent terms.
top_news <- head(freq_news, n = 10)
# Barplot
barplot(
  height = top_news,
  main = "News Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count",
  col = "lightgreen"
)
# Word cloud
# Only terms seen at least 100 times are drawn; the most frequent terms are
# placed first and a quarter of the words are rotated.
wordcloud(
  words = names(freq_news),
  freq = freq_news,
  min.freq = 100,
  random.order = FALSE,
  rot.per = 0.25,
  colors = brewer.pal(n = 8, name = "Dark2")
)
# Twitter analysis
# Term-by-document counts for the cleaned twitter sample.
term.doc.matrix_twitter <- TermDocumentMatrix(corpus_twitter)
# Total occurrences of every term across all documents, most frequent first.
# NOTE(review): as.matrix() builds the full dense matrix; watch memory use if
# the sample size is increased.
freq_twitter <- sort(
  rowSums(as.matrix(term.doc.matrix_twitter)),
  decreasing = TRUE
)
# The ten most frequent terms.
top_twitter <- head(freq_twitter, n = 10)
# Barplot
barplot(
  height = top_twitter,
  main = "Twitter Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count",
  col = "pink"
)
# Word cloud
# Only terms seen at least 100 times are drawn; the most frequent terms are
# placed first and a quarter of the words are rotated.
wordcloud(
  words = names(freq_twitter),
  freq = freq_twitter,
  min.freq = 100,
  random.order = FALSE,
  rot.per = 0.25,
  colors = brewer.pal(n = 8, name = "Dark2")
)