This report analyzes text data from three sources (Twitter, Blogs, and News) to build a next-word prediction algorithm. The algorithm will help users type faster on mobile devices, similar to SwiftKey and Gboard.
Key Questions Answered:
Before exploring the data, we apply light cleaning (lowercasing, URL removal, whitespace normalization, and dropping lines that end up empty) to ensure accurate statistics.
Important: We do NOT remove stop words, profanity, or punctuation in this phase. Those will be handled when building the actual prediction model.
# Define file paths
zip_file <- "Coursera-SwiftKey.zip"
data_dir <- "final/en_US/"
# Conditional download - only fetch/unzip when the extracted data directory is missing.
# The check uses dir.exists() on the path with any trailing separator stripped,
# because file.exists() on a directory name ending in "/" reports FALSE on Windows,
# which would force a needless unzip on every run there.
data_dir_present <- dir.exists(sub("[/\\\\]+$", "", data_dir))
if (!data_dir_present) {
  message("Data files not found. Downloading...")
  if (!file.exists(zip_file)) {
    message("Downloading Coursera-SwiftKey.zip...")
    # mode = "wb" requests a binary transfer explicitly so the archive stays byte-exact
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                  destfile = zip_file, mode = "wb")
  } else {
    message("Zip file already exists. Skipping download...")
  }
  message("Unzipping files...")
  # unzip() returns the extracted file paths; an empty result means the archive
  # was unreadable, so fail here instead of reporting success
  extracted_files <- unzip(zip_file)
  if (length(extracted_files) == 0) {
    stop("Unzipping ", zip_file, " produced no files; delete it and re-run to download again.",
         call. = FALSE)
  }
  message("Files unzipped successfully.")
} else {
  message("Data files already exist. Skipping download and unzip.")
}
## Data files already exist. Skipping download and unzip.
# Light cleaning function for EDA (preserves stop words, profanity, punctuation)
#
# Lowercases the text, strips URLs, collapses every whitespace run to a single
# space, trims both ends, and drops elements that end up empty.
#
# text_vector: character vector of raw lines.
# Returns: character vector of cleaned, non-empty lines (can be shorter than the input).
light_clean <- function(text_vector) {
  # Convert to lowercase
  lowered <- tolower(text_vector)
  # Remove URLs
  without_urls <- gsub("http\\S+|www\\S+|https\\S+", "", lowered)
  # Remove extra whitespace, then trim leading/trailing whitespace
  cleaned <- trimws(gsub("\\s+", " ", without_urls))
  # Remove empty strings after cleaning
  cleaned[cleaned != ""]
}

# Compute summary statistics for one text file, reading it in 10,000-line chunks
# and applying light_clean() to each chunk before counting.
#
# file_path: path to a UTF-8 text file.
# Returns: a one-row data.frame with columns File, Lines, Total_Words,
#   Total_Characters, Max_Line_Length and Avg_Words_Per_Line (NA when no
#   non-empty line remains after cleaning); NULL, with a warning, when the
#   file does not exist.
get_file_stats <- function(file_path) {
  if (!file.exists(file_path)) {
    warning(paste("File not found:", file_path))
    return(NULL)
  }
  con <- file(file_path, "r", encoding = "UTF-8")
  # Release the connection even if reading or cleaning fails part-way through
  on.exit(close(con), add = TRUE)
  line_count <- 0
  max_chars <- 0
  total_chars <- 0
  word_count <- 0
  repeat {
    lines <- readLines(con, 10000, skipNul = TRUE)
    if (length(lines) == 0) break
    # Apply light cleaning
    lines <- light_clean(lines)
    # Cleaning can empty a whole chunk; skip it so max() never sees a
    # zero-length vector (which would warn and yield -Inf)
    if (length(lines) == 0) next
    line_count <- line_count + length(lines)
    char_lengths <- nchar(lines)
    max_chars <- max(max_chars, max(char_lengths, na.rm = TRUE))
    total_chars <- total_chars + sum(char_lengths, na.rm = TRUE)
    # Count words: one per run of non-whitespace characters
    words_per_line <- vapply(
      gregexpr("\\S+", lines),
      function(match_pos) sum(match_pos > 0),
      integer(1)
    )
    word_count <- word_count + sum(words_per_line)
  }
  # Average words per line is undefined when nothing survived cleaning
  avg_words_per_line <- if (line_count > 0) {
    round(word_count / line_count, 1)
  } else {
    NA_real_
  }
  data.frame(
    File = basename(file_path),
    Lines = line_count,
    Total_Words = word_count,
    Total_Characters = total_chars,
    Max_Line_Length = max_chars,
    Avg_Words_Per_Line = avg_words_per_line
  )
}
# Define file paths
# Locations of the three English-language corpus files
twitter_path <- "final/en_US/en_US.twitter.txt"
blogs_path <- "final/en_US/en_US.blogs.txt"
news_path <- "final/en_US/en_US.news.txt"
# One statistics row per source file
twitter_full_stats <- get_file_stats(twitter_path)
blogs_full_stats <- get_file_stats(blogs_path)
news_full_stats <- get_file_stats(news_path)
# Stack the per-file rows into a single table
full_stats <- do.call(
  rbind,
  list(twitter_full_stats, blogs_full_stats, news_full_stats)
)
# Heading for the statistics table
print("=== FULL DATASET STATISTICS (After Light Cleaning) ===")
## [1] "=== FULL DATASET STATISTICS (After Light Cleaning) ==="
## File Lines Total_Words Total_Characters Max_Line_Length
## 1 en_US.twitter.txt 2360100 30362563 161816530 140
## 2 en_US.blogs.txt 899187 37331739 206718241 40833
## 3 en_US.news.txt 1010183 34367406 203113694 11384
## Avg_Words_Per_Line
## 1 12.9
## 2 41.5
## 3 34.0
# Draw a random line sample from a text file, reading it in 5,000-line chunks.
# Each chunk is cleaned with light_clean() and every surviving line is kept
# independently with probability sample_prob (one rbinom() call per chunk, so
# results are reproducible for a fixed seed).
#
# file_path:   path to a UTF-8 text file.
# sample_prob: probability of keeping each cleaned line (default 0.05).
# Returns: character vector of sampled lines (character(0) when nothing is
#   sampled); NULL, with a warning, when the file does not exist.
sample_file <- function(file_path, sample_prob = 0.05) {
  if (!file.exists(file_path)) {
    warning(paste("File not found:", file_path))
    return(NULL)
  }
  message(paste("Sampling from", basename(file_path), "-", sample_prob * 100, "%"))
  con <- file(file_path, "r", encoding = "UTF-8")
  # Release the connection even if reading or cleaning fails part-way through
  on.exit(close(con), add = TRUE)
  # Collect per-chunk samples in a list and flatten once at the end; appending
  # to one growing character vector would re-copy the whole sample every chunk
  sampled_chunks <- list()
  chunk_num <- 0
  total_lines_read <- 0
  repeat {
    lines <- readLines(con, 5000, skipNul = TRUE)
    if (length(lines) == 0) break
    # Apply light cleaning
    lines <- light_clean(lines)
    total_lines_read <- total_lines_read + length(lines)
    chunk_num <- chunk_num + 1
    # Use rbinom for random sampling
    keep <- rbinom(length(lines), 1, sample_prob)
    sampled_chunks[[chunk_num]] <- lines[keep == 1]
    if (chunk_num %% 100 == 0) {
      message(paste(" Processed", total_lines_read, "lines..."))
    }
  }
  sampled_lines <- unlist(sampled_chunks, use.names = FALSE)
  # unlist() of an empty list is NULL; always hand back a character vector
  if (is.null(sampled_lines)) {
    sampled_lines <- character(0)
  }
  message(paste(" Sampled", length(sampled_lines), "lines"))
  sampled_lines
}
# Set seed for reproducibility
# Fix the RNG state so the random line sample is reproducible
set.seed(1234)
# Fraction of lines kept from each file (5% is typical for this dataset)
sample_proportion <- 0.05
# Cache file holding a previously drawn sample
sample_file_path <- "text_sample_cleaned.RData"
if (!file.exists(sample_file_path)) {
  message("No existing sample found. Creating new samples...")
  # Draw a sample from each source file
  twitter_sample <- sample_file(twitter_path, sample_proportion)
  blogs_sample <- sample_file(blogs_path, sample_proportion)
  news_sample <- sample_file(news_path, sample_proportion)
  # Pool the three samples
  all_text <- c(twitter_sample, blogs_sample, news_sample)
  # Cache the samples so later runs can skip the sampling step
  save(twitter_sample, blogs_sample, news_sample, all_text, file = sample_file_path)
  message(paste("\nSamples saved to", sample_file_path))
} else {
  # A cached sample exists: restore it instead of re-sampling
  message("Saved sample found. Loading existing sample...")
  load(sample_file_path)
  message("Existing sample loaded successfully.")
}
## Saved sample found. Loading existing sample...
## Existing sample loaded successfully.
##
## === SAMPLE SIZES (After Light Cleaning) ===
## Twitter sample lines: 118090
## Blogs sample lines: 45237
## News sample lines: 50661
## Combined sample lines: 213988
# Count whitespace-separated tokens across a character vector.
#
# text_vector: character vector of lines (NULL or empty input counts as zero).
# Returns: the number of non-empty tokens obtained by splitting on whitespace runs.
count_words <- function(text_vector) {
  has_text <- !is.null(text_vector) && length(text_vector) != 0
  if (!has_text) {
    return(0)
  }
  tokens <- unlist(strsplit(text_vector, "\\s+"))
  # Leading whitespace produces an empty first token; do not count it
  non_empty_tokens <- tokens[tokens != ""]
  return(length(non_empty_tokens))
}
# Word totals for each sampled source
twitter_sample_words <- count_words(twitter_sample)
blogs_sample_words <- count_words(blogs_sample)
news_sample_words <- count_words(news_sample)
# Line and word counts in Twitter / Blogs / News order
sample_line_counts <- c(length(twitter_sample), length(blogs_sample), length(news_sample))
sample_word_counts <- c(twitter_sample_words, blogs_sample_words, news_sample_words)
# Summary table; the average is words divided by lines, rounded to one decimal
sample_stats <- data.frame(
  Source = c("Twitter", "Blogs", "News"),
  Sample_Lines = sample_line_counts,
  Sample_Words = sample_word_counts,
  Sample_Avg_Words_Per_Line = round(sample_word_counts / sample_line_counts, 1)
)
print("=== SAMPLE STATISTICS (After Light Cleaning) ===")
## [1] "=== SAMPLE STATISTICS (After Light Cleaning) ==="
## Source Sample_Lines Sample_Words Sample_Avg_Words_Per_Line
## 1 Twitter 118090 1520997 12.9
## 2 Blogs 45237 1878709 41.5
## 3 News 50661 1721110 34.0
# Build one column of the findings table for a single source.
# Mixing format()/paste0() strings with numbers inside c() coerces the whole
# column to character, which is what the printed table relies on.
findings_column <- function(file_name, sample_lines, sample_words) {
  source_row <- full_stats[full_stats$File == file_name, ]
  c(format(source_row[["Lines"]], big.mark = ","),
    round(source_row[["Total_Words"]] / 1e6, 1),
    source_row[["Avg_Words_Per_Line"]],
    format(source_row[["Max_Line_Length"]], big.mark = ","),
    paste0(round(length(sample_lines) / source_row[["Lines"]] * 100, 1), "%"),
    round(sample_words / 1000, 1))
}
# Findings table assembled from the computed full-corpus and sample statistics
key_findings <- data.frame(
  Metric = c("Total lines",
             "Total words (millions)",
             "Average words per line",
             "Maximum line length (characters)",
             "Sample size (% of original)",
             "Sample words (thousands)"),
  Twitter = findings_column("en_US.twitter.txt", twitter_sample, twitter_sample_words),
  Blogs = findings_column("en_US.blogs.txt", blogs_sample, blogs_sample_words),
  News = findings_column("en_US.news.txt", news_sample, news_sample_words)
)
print("=== KEY FINDINGS AT A GLANCE ===")
## [1] "=== KEY FINDINGS AT A GLANCE ==="
## Metric Twitter Blogs News
## 1 Total lines 2,360,100 899,187 1,010,183
## 2 Total words (millions) 30.4 37.3 34.4
## 3 Average words per line 12.9 41.5 34
## 4 Maximum line length (characters) 140 40,833 11,384
## 5 Sample size (% of original) 5% 5% 5%
## 6 Sample words (thousands) 1521 1878.7 1721.1
# Create data frames for plotting: line and word totals per source, in millions
volume_data <- data.frame(
  Source = c("Twitter", "Blogs", "News"),
  Lines = c(full_stats[full_stats$File == "en_US.twitter.txt", "Lines"],
            full_stats[full_stats$File == "en_US.blogs.txt", "Lines"],
            full_stats[full_stats$File == "en_US.news.txt", "Lines"]) / 1e6,
  Words = c(full_stats[full_stats$File == "en_US.twitter.txt", "Total_Words"],
            full_stats[full_stats$File == "en_US.blogs.txt", "Total_Words"],
            full_stats[full_stats$File == "en_US.news.txt", "Total_Words"]) / 1e6
)
# Source -> colour mapping, named so each colour is tied to its source.
# An unnamed vector is matched to the levels in alphabetical order
# (Blogs, News, Twitter), which would paint Twitter green and Blogs blue and
# contradict the per-source colours used in the top-words charts.
source_colors <- c(Twitter = "#1DA1F2", Blogs = "#FF9900", News = "#4CAF50")
# Lines bar chart
p1 <- ggplot(volume_data, aes(x = Source, y = Lines, fill = Source)) +
  geom_bar(stat = "identity") +
  labs(title = "Total Lines (Millions)",
       y = "Millions of Lines",
       x = "") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = source_colors) +
  geom_text(aes(label = round(Lines, 1)), vjust = -0.5)
# Words bar chart
p2 <- ggplot(volume_data, aes(x = Source, y = Words, fill = Source)) +
  geom_bar(stat = "identity") +
  labs(title = "Total Words (Millions)",
       y = "Millions of Words",
       x = "") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = source_colors) +
  geom_text(aes(label = round(Words, 1)), vjust = -0.5)
# Show both charts side by side
grid.arrange(p1, p2, ncol = 2)
# Calculate line lengths from cleaned samples
# Character count of every sampled (already cleaned) line
twitter_lengths <- nchar(twitter_sample)
blogs_lengths <- nchar(blogs_sample)
news_lengths <- nchar(news_sample)
# Create data frame for plotting: one row per sampled line
length_data <- data.frame(
  Length = c(twitter_lengths, blogs_lengths, news_lengths),
  Source = c(rep("Twitter", length(twitter_lengths)),
             rep("Blogs", length(blogs_lengths)),
             rep("News", length(news_lengths)))
)
# Filter to each source's own 99th percentile for better visualization,
# then drop the grouping so it does not leak into later operations
length_data_filtered <- length_data %>%
  group_by(Source) %>%
  filter(Length <= quantile(Length, 0.99)) %>%
  ungroup()
# Histogram with facets. The fill colours are named so each source keeps the
# colour used for it in the top-words charts; unnamed values would be assigned
# to the alphabetically ordered levels (Blogs, News, Twitter) instead.
ggplot(length_data_filtered, aes(x = Length, fill = Source)) +
  geom_histogram(alpha = 0.7, bins = 50, position = "identity") +
  facet_wrap(~Source, scales = "free_y") +
  labs(title = "Distribution of Text Length by Source (After Light Cleaning)",
       subtitle = "99th percentile shown (extreme outliers removed)",
       x = "Number of Characters",
       y = "Frequency") +
  theme_minimal() +
  scale_fill_manual(values = c(Twitter = "#1DA1F2", Blogs = "#FF9900", News = "#4CAF50")) +
  theme(legend.position = "bottom")
# Function to get top N words
# Return the n most frequent whitespace-separated words among the first 5,000
# elements of a text sample.
#
# text_sample: character vector of lines.
# n:           maximum number of words to return (default 10).
# Returns: data.frame with a factor column Word, whose levels are in reverse
#   frequency order so the most frequent word ends up on top after
#   coord_flip(), and a numeric column Frequency. Has fewer than n rows when
#   the text holds fewer than n distinct words, and zero rows for empty input.
get_top_words <- function(text_sample, n = 10) {
  # Take the first 5,000 lines for performance; head() is safe on empty input,
  # whereas 1:min(5000, 0) would index c(1, 0) and inject an NA line
  text_sample <- head(text_sample, 5000)
  # Combine and split
  all_words <- unlist(strsplit(paste(text_sample, collapse = " "), "\\s+"))
  # Leading whitespace yields an empty token, which is not a word
  all_words <- all_words[nzchar(all_words)]
  if (length(all_words) == 0) {
    return(data.frame(Word = factor(character(0)), Frequency = numeric(0)))
  }
  # Count frequencies
  word_counts <- sort(table(all_words), decreasing = TRUE)
  # Never index past the number of distinct words (that would add NA rows)
  word_counts <- word_counts[seq_len(min(n, length(word_counts)))]
  # Create dataframe
  top_words <- data.frame(
    Word = names(word_counts),
    Frequency = as.numeric(word_counts)
  )
  # Reverse for horizontal bar chart
  top_words$Word <- factor(top_words$Word, levels = rev(top_words$Word))
  top_words
}
# Ten most frequent words per source
twitter_top <- get_top_words(twitter_sample, 10)
blogs_top <- get_top_words(blogs_sample, 10)
news_top <- get_top_words(news_sample, 10)
# Horizontal bar chart of word frequencies in a single fill colour
plot_top_words <- function(top_words, plot_title, bar_fill) {
  ggplot(top_words, aes(x = Word, y = Frequency)) +
    geom_bar(stat = "identity", fill = bar_fill) +
    coord_flip() +
    labs(title = plot_title,
         x = "",
         y = "Frequency") +
    theme_minimal()
}
# One chart per source, each in that source's colour
p_twitter <- plot_top_words(twitter_top, "Twitter: Most Common Words", "#1DA1F2")
p_blogs <- plot_top_words(blogs_top, "Blogs: Most Common Words", "#FF9900")
p_news <- plot_top_words(news_top, "News: Most Common Words", "#4CAF50")
# Lay the three charts out in a single row
grid.arrange(p_twitter, p_blogs, p_news, ncol = 3)
Findings
##
## === INTERESTING FINDINGS (After Light Cleaning) ===
## 1. DATA VOLUME:
## Total words across all sources: 102,061,708
## → Over 100 million words available for training
## 2. LENGTH PATTERNS:
# Look up a source file's full-corpus average words per line
avg_words_for <- function(file_name) {
  full_stats[full_stats$File == file_name, "Avg_Words_Per_Line"]
}
# Print the average for each source
cat(" Twitter average:", avg_words_for("en_US.twitter.txt"), "words per line\n")
## Twitter average: 12.9 words per line
cat(" Blogs average:", avg_words_for("en_US.blogs.txt"), "words per line\n")
## Blogs average: 41.5 words per line
cat(" News average:", avg_words_for("en_US.news.txt"), "words per line\n")
## News average: 34 words per line
## → Blogs and News have much longer, more formal text
## 3. MOST COMMON WORDS:
## Top word in Twitter: 'the'
## Top word in Blogs: 'the'
## Top word in News: 'the'
## → Stop words ('the', 'to', 'and') dominate all sources
## 4. SAMPLE REPRESENTATIVENESS:
# Total sampled words across all three sources, in whole thousands
sample_words_thousands <- round(sum(sample_stats$Sample_Words) / 1000, 0)
cat(" 5% sample provides approximately", sample_words_thousands, "thousand words\n")
## 5% sample provides approximately 5121 thousand words
## → Sufficient for model development while being memory-efficient
Our Approach: Stupid Backoff with N-Grams
We will build a 4-layer prediction model that backs off from the longest matching n-gram to shorter ones:
| Layer | Pattern | Coverage | Accuracy |
|---------|------------------------|----------|----------|
| 4-gram | Looks at last 3 words | Low | High |
| 3-gram | Looks at last 2 words | Medium | Medium |
| 2-gram | Looks at last 1 word | High | Low |
| Unigram | Most common word | 100% | Baseline |

Heavy Cleaning for Model Building

Unlike this exploratory analysis, the actual prediction model will include the heavier cleaning deferred here: handling of stop words, profanity, and punctuation.
Example Prediction Flow
User types: “I am going to the” → Algorithm looks up the last three words in the 4-gram table: “going to the ___” → Returns: [“store”, “movies”, “gym”]
Planned Features
| Feature | Purpose |
|----------------------|---------------------------------|
| Text input box | User types their message |
| 3 prediction buttons | One-tap word suggestions |
| Word counter | Useful for Twitter users |
| Clear button | Reset conversation |

Performance Targets

- Prediction speed: <100ms
- Top-3 accuracy: >40%
- Memory usage: <500MB
| Phase | Duration | Deliverable |
|------------------|----------|----------------------------------|
| Data preparation | 2 days | Cleaned n-gram tables |
| Model building | 3 days | Stupid Backoff implementation |
| Optimization | 2 days | Speed tuning + profanity filter |
| Shiny app | 3 days | Interactive prototype |
| Testing | 2 days | User acceptance |
| Total | 12 days | Production-ready app |

Key Takeaways for Management
Data is ready: 100M+ words across 3 sources after light cleaning
Approach is sound: Stupid Backoff with n-grams (industry-proven)
Clear metrics: Accuracy, speed, and user satisfaction
12-day timeline: Working prototype in under 2 weeks