This milestone report documents progress on the Capstone Project. It covers loading the data, basic text cleaning, and an initial exploration of word frequencies. The final goal is to create a word prediction app using Shiny.

# Install required packages if not already installed
required_packages <- c("stringr", "tm", "tidytext", "dplyr", "ggplot2",
                       "tokenizers", "data.table", "knitr")
new_packages <- required_packages[!(required_packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) install.packages(new_packages)
library(tm)
library(tokenizers)
library(dplyr)
library(stringr)
library(tidytext)
library(data.table)
library(ggplot2)
# Define file paths
blog_file <- "C:/Users/Shreyash/OneDrive/Documents/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news_file <- "C:/Users/Shreyash/OneDrive/Documents/Coursera-SwiftKey/final/en_US/en_US.news.txt"
twitter_file <- "C:/Users/Shreyash/OneDrive/Documents/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
# Initialize variables
blogs <- character(0)
news <- character(0)
twitter <- character(0)
# Load the data files with error handling
tryCatch({
  blogs <- readLines(blog_file, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
  cat("Successfully loaded blogs:", length(blogs), "lines\n")
}, error = function(e) {
  cat("Error loading blogs:", e$message, "\n")
})
## Successfully loaded blogs: 899288 lines
tryCatch({
  news <- readLines(news_file, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
  cat("Successfully loaded news:", length(news), "lines\n")
}, error = function(e) {
  cat("Error loading news:", e$message, "\n")
})
## Successfully loaded news: 1010206 lines
tryCatch({
  twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
  cat("Successfully loaded twitter:", length(twitter), "lines\n")
}, error = function(e) {
  cat("Error loading twitter:", e$message, "\n")
})
## Successfully loaded twitter: 2360148 lines
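
For additional context on scale, the size of each raw file on disk can be checked with base R's `file.size()`. This is an optional sketch that reuses the file paths defined above; `file_sizes_mb` is just an illustrative name.

```r
# Optional sketch: raw file sizes in megabytes (paths defined above)
file_sizes_mb <- round(file.size(c(blog_file, news_file, twitter_file)) / 1024^2, 1)
data.frame(Source = c("Blogs", "News", "Twitter"), Size_MB = file_sizes_mb)
```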
# Create summary table (only if data was loaded successfully)
if (length(blogs) > 0 && length(news) > 0 && length(twitter) > 0) {
  summary_table <- data.frame(
    Source = c("Blogs", "News", "Twitter"),
    Lines = c(length(blogs), length(news), length(twitter)),
    Words = c(
      sum(str_count(blogs, "\\w+")),
      sum(str_count(news, "\\w+")),
      sum(str_count(twitter, "\\w+"))
    )
  )
  print(summary_table)
} else {
  cat("Could not create summary table due to loading errors.\n")
}
##    Source   Lines    Words
## 1   Blogs  899288 38309620
## 2    News 1010206 35622913
## 3 Twitter 2360148 31003544
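
Beyond line and word counts, line lengths differ noticeably between the sources (tweets are length-capped, blog posts are not). A small exploratory sketch, reusing the character vectors loaded above, finds the longest line per source with `nchar()`:

```r
# Optional sketch: longest line (in characters) per source
sapply(list(Blogs = blogs, News = news, Twitter = twitter),
       function(x) max(nchar(x), na.rm = TRUE))
```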
# Set seed for reproducibility
set.seed(1234)
# Check if data was loaded successfully before sampling
if (length(blogs) > 0 && length(news) > 0 && length(twitter) > 0) {
  # Sample 1% of the lines from each source
  sample_data <- c(
    sample(blogs, max(1, floor(length(blogs) * 0.01))),
    sample(news, max(1, floor(length(news) * 0.01))),
    sample(twitter, max(1, floor(length(twitter) * 0.01)))
  )
  cat("Sample data created with", length(sample_data), "lines\n")
  # Create corpus and clean text: lowercase, remove punctuation, collapse whitespace
  corpus <- VCorpus(VectorSource(sample_data))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  cat("Text cleaning completed\n")
} else {
  cat("Cannot proceed with sampling - data not loaded properly\n")
}
## Sample data created with 42695 lines
## Text cleaning completed
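
Because reading and cleaning the full files is the slowest part of the analysis, the cleaned corpus could be cached to disk and reloaded in later sessions. A minimal sketch, assuming a local file name of `sample_corpus.rds` (an illustrative name, not part of the project):

```r
# Optional sketch: cache the cleaned corpus so later runs can skip the slow steps
if (exists("corpus")) {
  saveRDS(corpus, "sample_corpus.rds")
}
# Later runs could restore it with: corpus <- readRDS("sample_corpus.rds")
```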
# Check if corpus exists before proceeding
if (exists("corpus") && length(corpus) > 0) {
  # Convert to a data frame for tidytext processing
  text_df <- data.frame(text = sapply(corpus, as.character), stringsAsFactors = FALSE)
  # Create unigrams and count frequencies
  unigrams <- text_df %>%
    unnest_tokens(word, text) %>%
    count(word, sort = TRUE)
  # Get the top 20 words
  top_words <- head(unigrams, 20)
  # Display the table
  knitr::kable(top_words, caption = "Top 20 Most Frequent Words")
} else {
  cat("Cannot perform word frequency analysis - corpus not available\n")
}
Table: Top 20 Most Frequent Words

| word |     n |
|:-----|------:|
| the  | 47585 |
| to   | 27635 |
| and  | 23882 |
| a    | 23646 |
| of   | 20117 |
| i    | 16485 |
| in   | 16431 |
| for  | 10994 |
| is   | 10784 |
| that | 10416 |
| you  |  9332 |
| it   |  9131 |
| on   |  7997 |
| with |  7073 |
| was  |  6411 |
| my   |  6025 |
| at   |  5689 |
| be   |  5535 |
| this |  5466 |
| have |  5226 |
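
The most frequent single words are dominated by stopwords, so a natural next exploratory step is to look at word pairs. `unnest_tokens()` from tidytext supports n-gram tokenization directly; the sketch below counts bigrams in the same `text_df` (the `bigrams` name is illustrative), and the table it builds is reused in the prediction sketch at the end of the report.

```r
# Sketch: count bigrams (two-word sequences) in the sampled, cleaned text
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%   # lines with fewer than two words yield NA
  count(bigram, sort = TRUE)
head(bigrams, 10)
```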
# Create bar plot of top words (only if data is available)
if (exists("top_words") && nrow(top_words) > 0) {
  ggplot(top_words, aes(x = reorder(word, n), y = n)) +
    geom_col(fill = "skyblue") +
    coord_flip() +
    labs(title = "Top 20 Most Frequent Words", x = "Word", y = "Frequency") +
    theme_minimal()
} else {
  cat("Cannot create plot - word frequency data not available\n")
}
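
As a small preview of the stated end goal (a word prediction app in Shiny), a naive next-word lookup can already be built from bigram counts: split each bigram into its first and second word, then return the most frequent continuations of a given word. This is only a hedged sketch built on the illustrative `bigrams` table from the previous snippet (the names `bigram_model` and `predict_next` are likewise illustrative), not the final prediction model.

```r
# Sketch: naive next-word prediction from the bigram counts computed above
bigram_model <- bigrams %>%
  mutate(word1 = word(bigram, 1),   # stringr::word() splits on whitespace
         word2 = word(bigram, 2))

predict_next <- function(prev_word, top_n = 3) {
  bigram_model %>%
    filter(word1 == tolower(prev_word)) %>%  # counts are already sorted
    head(top_n) %>%
    pull(word2)
}

predict_next("thanks")  # most frequent words observed after "thanks" in the sample
```

The eventual app would likely extend this idea to higher-order n-grams with a backoff or smoothing strategy.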