Introduction

This report is part of the Coursera Data Science Capstone Project. The goal is to perform exploratory data analysis on a large corpus of English text data from blogs, news, and Twitter. This report demonstrates that the data has been loaded successfully, summarizes key statistics, and outlines the plan to build a next-word prediction algorithm and a Shiny web application.


Data Summary

Data Sources

The datasets are from the HC Corpora and include:

  • en_US.blogs.txt – Blog posts
  • en_US.news.txt – News articles
  • en_US.twitter.txt – Tweets

Basic File Statistics

library(stringi)

# Load the data
#
# The file locations are defined exactly once and reused for both reading and
# the file-size lookup, so the two can never point at different places (a
# size lookup on a non-existent path yields NA).
data_dir <- "C:/Users/Lenovo/Documents/final/en_US"
data_files <- c(
  Blogs = file.path(data_dir, "en_US.blogs.txt"),
  News = file.path(data_dir, "en_US.news.txt"),
  Twitter = file.path(data_dir, "en_US.twitter.txt")
)

# Read every line of a text file into a character vector.
#
# path: location of the file to read.
# Returns a character vector with one element per line, marked as UTF-8.
#
# The file is opened in binary mode because a text-mode read on Windows treats
# an embedded Ctrl-Z (0x1A) byte as end-of-file and silently drops the rest of
# the file. `skipNul = TRUE` keeps the remainder of a line that contains an
# embedded NUL instead of cutting the line short.
read_corpus_file <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus_file(data_files[["Blogs"]])
news <- read_corpus_file(data_files[["News"]])
twitter <- read_corpus_file(data_files[["Twitter"]])

# Get summary stats: size on disk (MiB), number of lines and number of words
# for each source.
stats <- data.frame(
  File = names(data_files),
  FileSize_MB = file.info(data_files)$size / 1024^2,
  LineCount = c(length(blogs), length(news), length(twitter)),
  WordCount = c(sum(stri_count_words(blogs)),
                sum(stri_count_words(news)),
                sum(stri_count_words(twitter)))
)

knitr::kable(stats, caption = "Summary Statistics of the Data Files")
Summary Statistics of the Data Files
File FileSize_MB LineCount WordCount
Blogs NA 899288 37546806
News NA 77259 2674561
Twitter NA 2360148 30096649
## Line Length Analysis
library(ggplot2)

# Build a long-format table with one row per line of text: the source it came
# from and its length in characters.
n_lines <- c(Blogs = length(blogs),
             News = length(news),
             Twitter = length(twitter))
chars_per_line <- c(nchar(blogs), nchar(news), nchar(twitter))
line_lengths <- data.frame(
  Source = rep(names(n_lines), times = n_lines),
  LineLength = chars_per_line
)

# Histogram of line lengths, one panel per source; each panel gets its own
# axis ranges because the sources differ widely in typical line length.
ggplot(data = line_lengths, mapping = aes(x = LineLength, fill = Source)) +
  geom_histogram(binwidth = 100, show.legend = FALSE) +
  facet_wrap(vars(Source), scales = "free") +
  ggtitle("Line Length Distribution by Source") +
  xlab("Line Length (characters)") +
  ylab("Count")

## Word Frequency Analysis

library(tm)

# Sample to reduce memory usage
# A fixed seed keeps the sample, and therefore the plot, reproducible.
set.seed(123)
sample_size <- 5000
# Cap the request at the number of available lines so that a source with
# fewer lines than `sample_size` does not make sample() fail.
sample_data <- c(sample(blogs, min(sample_size, length(blogs))),
                 sample(news, min(sample_size, length(news))),
                 sample(twitter, min(sample_size, length(twitter))))

# Create corpus
corpus <- Corpus(VectorSource(sample_data))
corpus <- tm_map(corpus, content_transformer(tolower))
# Stop words are removed BEFORE punctuation: the English stop word list
# contains contractions such as "don't" and "i'm", which would no longer match
# once their apostrophes have been stripped.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# Term Document Matrix
tdm <- TermDocumentMatrix(corpus)
# Drop very sparse terms so the dense matrix built below stays small.
tdm <- removeSparseTerms(tdm, 0.99)
m <- as.matrix(tdm)
# Total occurrences of each term across all sampled documents, most frequent
# first.
word_freqs <- sort(rowSums(m), decreasing = TRUE)
df <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot
# head() returns at most 20 rows; indexing with 1:20 would add all-NA rows if
# fewer than 20 terms survived the filtering above.
ggplot(head(df, 20), aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words", x = "Word", y = "Frequency")

Interesting Findings

Next Steps