Introduction

This report provides a short overview of the exploratory analysis of the text data to be used for the Capstone project for the Data Science Specialization. It highlights the major features of the data, explores summary statistics, and outlines plans for creating a word prediction algorithm and Shiny app.


Data Loading and Summary Statistics

The datasets used are en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt. Below is a summary of the basic statistics for each dataset, including file size, line count, and word count.

# Load required libraries
library(knitr)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(stringi)

# Locations of the three English-language source files.
# All of them sit in the same folder, so the folder is written once and the
# individual file names are appended to it.
data.dir <- "C:/Users/aaron/Downloads/Coursera-SwiftKey/final/en_US"
file.list <- file.path(
  data.dir,
  c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
)

# Read one text file in full and return its lines as a character vector.
#
# path: path of the file to read.
#
# The connection is opened in binary mode ("rb"), as the original code did
# (presumably so that stray control characters in the raw data cannot end the
# read early -- confirm before changing). on.exit() closes the connection even
# when readLines() stops with an error, so a failed read no longer leaks it.
read_text_lines <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Labels for the sources, in the same order as the entries of file.list
source.names <- c("blogs", "news", "twitter")
stopifnot(length(file.list) == length(source.names))

# Read data: one character vector of lines per source (text$blogs, text$news,
# text$twitter)
text <- setNames(lapply(file.list, read_text_lines), source.names)

# Populate summary matrix: one row per source. Everything is sized from the
# inputs rather than from a hard-coded 1:3.
matrix.summary <- matrix(
  0, nrow = length(text), ncol = 3,
  dimnames = list(source.names, c("File Size (MB)", "Lines", "Words"))
)
matrix.summary[, "File Size (MB)"] <- round(file.size(file.list) / 1024^2, 2)
matrix.summary[, "Lines"] <- lengths(text)
matrix.summary[, "Words"] <- vapply(
  text,
  function(lines) as.numeric(sum(stri_count_words(lines))),
  numeric(1)
)

# Display summary table
kable(matrix.summary)
|         | File Size (MB) |   Lines |    Words |
|:--------|---------------:|--------:|---------:|
| blogs   |         200.42 |  899288 | 37546806 |
| news    |         196.28 | 1010242 | 34762658 |
| twitter |         159.36 | 2360148 | 30096690 |
# Fix the random seed so the subsets drawn below are the same on every run
set.seed(123)

# Sampling
# Draw a random subset of `lines` whose size is `fraction` of the total.
draw_lines <- function(lines, fraction = 0.005) {
  sample(lines, fraction * length(lines))
}

# The three draws happen in the order blogs, news, twitter
blogs_sample <- draw_lines(text[["blogs"]])
news_sample <- draw_lines(text[["news"]])
twitter_sample <- draw_lines(text[["twitter"]])

# Preprocessing function
# Clean a character vector of raw text lines and return it as a tm corpus.
#
# sample: character vector, one document per element.
#
# Returns the corpus produced by Corpus(VectorSource(sample)) after these
# steps, in order: lower-casing, English stop-word removal, punctuation
# removal, number removal, whitespace collapsing.
preprocess_text <- function(sample) {
  corpus <- Corpus(VectorSource(sample))
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Stop words must go BEFORE punctuation: stopwords("english") lists
  # contractions with their apostrophe ("don't", "i'm", ...). Stripping
  # punctuation first turns them into "dont" / "im", which no longer match
  # the list and would survive into the frequency counts.
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  # Last, because every removal above leaves gaps behind
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}

# Apply preprocessing
# Each sampled source is cleaned separately with preprocess_text(), giving one
# corpus per source (blogs, news, twitter). The "## Warning" lines after each
# call are the messages captured when the report was rendered: tm_map() issued
# one "transformation drops documents" warning per cleaning step.
corpus_blogs <- preprocess_text(blogs_sample)
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
corpus_news <- preprocess_text(news_sample)
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
corpus_twitter <- preprocess_text(twitter_sample)
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
# Total occurrences of every term in a term-document matrix, sorted from the
# most to the least frequent.
#
# tdm: a term-document matrix as returned by TermDocumentMatrix().
#
# Returns a named numeric vector; the names are the terms.
#
# The totals are added up directly from the sparse triplet fields of the
# matrix (`i` = term index, `v` = count) rather than through
# rowSums(as.matrix(tdm)). The dense conversion allocates a full
# terms x documents matrix only to sum its rows, which is wasteful for a
# corpus of thousands of documents.
term_frequencies <- function(tdm) {
  totals <- numeric(nTerms(tdm))
  names(totals) <- Terms(tdm)
  if (length(tdm$v) > 0) {
    # rowsum() groups the counts by term index; its row names are the indices
    sums <- rowsum(as.numeric(tdm$v), tdm$i)
    totals[as.integer(rownames(sums))] <- sums[, 1]
  }
  sort(totals, decreasing = TRUE)
}

# Draw the two standard plots for one source: a bar plot of its most frequent
# words, then a word cloud of every word seen at least `min_freq` times.
#
# freq:     named frequency vector for the whole source (word cloud input).
# top:      the leading entries of `freq` to show as bars.
# title:    main title of the bar plot.
# bar_col:  fill colour of the bars.
# min_freq: minimum count for a word to appear in the cloud.
#
# Called for its plots; returns `freq` invisibly.
plot_word_summary <- function(freq, top, title, bar_col, min_freq = 100) {
  barplot(
    top, main = title,
    xlab = "Word", ylab = "Count", col = bar_col
  )
  wordcloud(
    names(freq), freq, min.freq = min_freq,
    random.order = FALSE, rot.per = 0.25, colors = brewer.pal(8, "Dark2")
  )
  invisible(freq)
}

# Blogs analysis
term.doc.matrix_blogs <- TermDocumentMatrix(corpus_blogs)
freq_blogs <- term_frequencies(term.doc.matrix_blogs)
top_blogs <- head(freq_blogs, 10)
plot_word_summary(
  freq_blogs, top_blogs, "Blogs Data: Most Frequent Words", "skyblue"
)

# News analysis
term.doc.matrix_news <- TermDocumentMatrix(corpus_news)
freq_news <- term_frequencies(term.doc.matrix_news)
top_news <- head(freq_news, 10)
plot_word_summary(
  freq_news, top_news, "News Data: Most Frequent Words", "lightgreen"
)

# Twitter analysis
term.doc.matrix_twitter <- TermDocumentMatrix(corpus_twitter)
freq_twitter <- term_frequencies(term.doc.matrix_twitter)
top_twitter <- head(freq_twitter, 10)
plot_word_summary(
  freq_twitter, top_twitter, "Twitter Data: Most Frequent Words", "pink"
)