Data Science Capstone: Milestone Report

# Install any packages this report needs that are not already installed.
# The list covers every package attached with library() in this report plus
# knitr, which is called as knitr::kable() to render the frequency table.
required_packages <- c(
  "stringr", "tm", "tokenizers", "tidytext", "dplyr", "data.table",
  "ggplot2", "knitr"
)
# setdiff() keeps only the names that are missing from the installed set.
new_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

Introduction

This milestone report documents progress on the Capstone Project. It includes data loading, basic text cleaning, and initial exploration of word frequencies. The final goal is to create a word prediction app using Shiny.

Dataset Summary

library(tm)
library(tokenizers)
library(dplyr)
library(stringr)
library(tidytext)
library(data.table)
library(ggplot2)

# Directory holding the three en_US corpus files. Kept in one place so the
# location only has to be changed once when the data lives elsewhere.
data_dir <- "C:/Users/Shreyash/OneDrive/Documents/Coursera-SwiftKey/final/en_US"

# Define file paths
blog_file <- file.path(data_dir, "en_US.blogs.txt")
news_file <- file.path(data_dir, "en_US.news.txt")
twitter_file <- file.path(data_dir, "en_US.twitter.txt")

# Initialize variables as empty character vectors so that length checks on
# them are meaningful even when a file cannot be read.
blogs <- character(0)
news <- character(0)
twitter <- character(0)

# Load the data files with error handling.
# Blogs: read every line as UTF-8, dropping embedded NULs. If reading fails,
# the error message is printed and `blogs` keeps its previous value.
tryCatch(
  expr = {
    blogs <- readLines(
      con = blog_file,
      encoding = "UTF-8",
      warn = FALSE,
      skipNul = TRUE
    )
    cat("Successfully loaded blogs:", length(blogs), "lines\n")
  },
  error = function(err) {
    cat("Error loading blogs:", conditionMessage(err), "\n")
  }
)

## Successfully loaded blogs: 899288 lines

# News: read every line as UTF-8, dropping embedded NULs. If reading fails,
# the error message is printed and `news` keeps its previous value.
tryCatch(
  expr = {
    news <- readLines(
      con = news_file,
      encoding = "UTF-8",
      warn = FALSE,
      skipNul = TRUE
    )
    cat("Successfully loaded news:", length(news), "lines\n")
  },
  error = function(err) {
    cat("Error loading news:", conditionMessage(err), "\n")
  }
)

## Successfully loaded news: 1010206 lines

# Twitter: read every line as UTF-8, dropping embedded NULs. If reading fails,
# the error message is printed and `twitter` keeps its previous value.
tryCatch(
  expr = {
    twitter <- readLines(
      con = twitter_file,
      encoding = "UTF-8",
      warn = FALSE,
      skipNul = TRUE
    )
    cat("Successfully loaded twitter:", length(twitter), "lines\n")
  },
  error = function(err) {
    cat("Error loading twitter:", conditionMessage(err), "\n")
  }
)

## Successfully loaded twitter: 2360148 lines

# Summarise each source by line count and word count. The table is built and
# printed only when all three sources contain at least one line; otherwise a
# short notice is printed instead.
if (length(blogs) == 0 || length(news) == 0 || length(twitter) == 0) {
  cat("Could not create summary table due to loading errors.\n")
} else {
  # A "word" here is any maximal run of regex word characters.
  count_words <- function(lines) {
    sum(str_count(lines, "\\w+"))
  }

  line_counts <- c(length(blogs), length(news), length(twitter))
  word_counts <- c(
    count_words(blogs),
    count_words(news),
    count_words(twitter)
  )

  summary_table <- data.frame(
    Source = c("Blogs", "News", "Twitter"),
    Lines = line_counts,
    Words = word_counts
  )
  print(summary_table)
}

##    Source   Lines    Words
## 1   Blogs  899288 38309620
## 2    News 1010206 35622913
## 3 Twitter 2360148 31003544

Data Sampling and Text Processing

# Set seed for reproducibility
set.seed(1234)

# Fraction of lines drawn from each source (1%).
sample_fraction <- 0.01

# Check if data was loaded successfully before sampling
if (length(blogs) > 0 && length(news) > 0 && length(twitter) > 0) {

  # Draw a random subset of `lines`. The size is floored to a whole number
  # explicitly rather than leaving a fractional value for sample() to coerce,
  # and is never smaller than one line.
  draw_sample <- function(lines) {
    sample(lines, max(1, floor(length(lines) * sample_fraction)))
  }

  # Sources are sampled in a fixed order (blogs, news, twitter) so the seeded
  # random stream selects the same lines on every run.
  sample_data <- c(
    draw_sample(blogs),
    draw_sample(news),
    draw_sample(twitter)
  )

  cat("Sample data created with", length(sample_data), "lines\n")

  # Create corpus and clean text: lower-case, strip punctuation, collapse
  # whitespace. tolower is a plain base function, so it needs the
  # content_transformer() wrapper; removePunctuation and stripWhitespace are
  # tm transformations and are passed to tm_map() directly.
  corpus <- VCorpus(VectorSource(sample_data))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)

  cat("Text cleaning completed\n")

} else {
  cat("Cannot proceed with sampling - data not loaded properly\n")
}

## Sample data created with 42695 lines
## Text cleaning completed

Word Frequency Analysis

# Count word frequencies in the cleaned corpus and show the 20 most common
# words. Runs only when a non-empty `corpus` object exists.
if (exists("corpus") && length(corpus) > 0) {
  # Convert to data frame for tidytext processing. vapply() requires exactly
  # one string per document, so an unexpected document shape raises an error
  # here instead of silently producing a list column.
  text_df <- data.frame(
    text = vapply(corpus, as.character, character(1)),
    stringsAsFactors = FALSE
  )

  # Create unigrams and count frequencies, most frequent first
  unigrams <- text_df %>%
    unnest_tokens(word, text) %>%
    count(word, sort = TRUE)

  # Get top 20 words
  top_words <- head(unigrams, 20)

  # Display the table
  knitr::kable(top_words, caption = "Top 20 Most Frequent Words")

} else {
  cat("Cannot perform word frequency analysis - corpus not available\n")
}

Top 20 Most Frequent Words
word	n
the	47585
to	27635
and	23882
a	23646
of	20117
i	16485
in	16431
for	10994
is	10784
that	10416
you	9332
it	9131
on	7997
with	7073
was	6411
my	6025
at	5689
be	5535
this	5466
have	5226

Visualization

# Horizontal bar chart of the most frequent words, ordered by count. Drawn
# only when `top_words` exists and has at least one row; otherwise a notice
# is printed.
if (!exists("top_words") || nrow(top_words) < 1) {
  cat("Cannot create plot - word frequency data not available\n")
} else {
  ggplot(data = top_words, mapping = aes(x = reorder(word, n), y = n)) +
    geom_bar(fill = "skyblue", stat = "identity") +
    coord_flip() +
    labs(x = "Word", y = "Frequency", title = "Top 20 Most Frequent Words") +
    theme_minimal()
}

Next Steps

- Implement n-gram analysis (bigrams, trigrams)
- Build prediction algorithm
- Create Shiny application interface
- Optimize for performance and accuracy