This project develops a predictive typing assistant using English text from blogs, news articles, and Twitter. The goal is to help users type faster by suggesting likely next words, much like smartphone keyboard suggestions.
We analyzed three English text sources:
# Load the libraries used throughout this report
library(tidyverse)   # dplyr, stringr, purrr, tidyr, ggplot2, forcats
library(tidytext)    # unnest_tokens(), stop_words
library(knitr)       # kable()
library(kableExtra)  # kable_styling(), add_header_above()
library(igraph)      # graph_from_data_frame()
library(ggraph)      # network plots

# Adjust this path to wherever the corpus is stored locally
setwd("/Users/xav_21/Desktop/Data_Science_R/coursera/final/en_US/")
files <- c("en_US.blogs.txt",
           "en_US.news.txt",
           "en_US.twitter.txt")

# Read each file once, then compute all summary statistics from memory
lines_list <- lapply(files, readLines, warn = FALSE)
stats <- tibble(
  Source = c("Blogs", "News", "Twitter"),
  `Size (MB)` = round(file.size(files) / (1024^2), 1),
  Entries = sapply(lines_list, length),
  `Total Words` = sapply(lines_list, function(x) sum(str_count(x, "\\S+"))),
  `Longest Entry (chars)` = sapply(lines_list, function(x) format(max(nchar(x)), big.mark = ","))
)
stats %>%
  kable(align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  add_header_above(c("Text Source Summary" = 5))
| Source  | Size (MB) | Entries   | Total Words | Longest Entry (chars) |
|---------|-----------|-----------|-------------|-----------------------|
| Blogs   | 200.4     | 899,288   | 37,334,131  | 40,833                |
| News    | 196.3     | 1,010,242 | 34,372,530  | 11,384                |
| Twitter | 159.4     | 2,360,148 | 30,373,543  | 140                   |
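Twitter contributes the most entries but the shortest ones: its 140-character longest entry reflects the platform's tweet-length limit at the time the corpus was collected, while the longest blog entry runs to 40,833 characters. Since the full files total over 550 MB, the exploratory analysis below works with a random sample of 5,000 lines per source.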
# Sample each corpus and tokenize into single words
set.seed(123)
combined <- map_df(files, ~{
  # Read all lines, then sample 5,000 to keep the analysis tractable
  all_lines <- readLines(.x, warn = FALSE)
  sampled_lines <- sample(all_lines, size = min(5000, length(all_lines)))
  tibble(
    text = sampled_lines,
    # Extract "blogs", "news", or "twitter" from the file name
    source = str_replace(.x, ".*en_US\\.(.*?)\\.txt", "\\1")
  )
}) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")   # drop common stop words
# Plot the 20 most frequent words across all three sources
p1 <- combined %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "#3498db") +
  coord_flip() +
  labs(title = "Top 20 Words", x = "", y = "Count")

# Display the plot
p1
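The source tag attached above is not used in this overall ranking; a natural follow-up is to compare vocabularies across the three sources. A minimal sketch of one way to do that (p2, the top-10 cutoff, and the faceting choices are illustrative, not part of the original analysis):

p2 <- combined %>%
  count(source, word, sort = TRUE) %>%
  group_by(source) %>%
  slice_max(n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  # reorder_within()/scale_x_reordered() are tidytext helpers for
  # ordering bars independently within each facet
  mutate(word = reorder_within(word, n, source)) %>%
  ggplot(aes(word, n, fill = source)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  facet_wrap(~ source, scales = "free_y") +
  labs(title = "Top 10 Words by Source", x = "", y = "Count")
p2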
# Step 1: Prepare the text data
text_samples <- map_df(files, ~{
  # Read the file, then sample 5,000 lines from it
  all_lines <- readLines(.x, warn = FALSE, encoding = "UTF-8")
  tibble(
    text = sample(all_lines, size = min(5000, length(all_lines))),
    source = str_replace(.x, ".*en_US\\.(.*?)\\.txt", "\\1")
  )
})
# Step 2: Find common word pairs
common_pairs <- text_samples %>%
  # Extract all 2-word phrases
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Split into separate words
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # Remove common words and missing values
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !is.na(word1),
    !is.na(word2)
  ) %>%
  # Count how often each pair appears
  count(word1, word2, sort = TRUE) %>%
  # Keep only pairs appearing at least 20 times
  filter(n >= 20)
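Before moving to Step 3, note that these bigram counts are already enough for a first next-word suggester of the kind described in the introduction. A minimal sketch (predict_next, its defaults, and the example word are illustrative, not part of the milestone code): given the last typed word, return its most frequent followers.

predict_next <- function(w, pairs = common_pairs, k = 3) {
  pairs %>%
    filter(word1 == w) %>%                       # keep pairs starting with the typed word
    slice_max(n, n = k, with_ties = FALSE) %>%   # take the k most frequent pairs
    pull(word2)                                  # return the follower words
}
predict_next("happy")  # top candidate words to follow "happy"

A fuller model would keep stop words, since suggestions like "the" or "to" are often exactly what a typist wants next.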
# Step 3: Visualize the network
if (nrow(common_pairs) > 0) {
  # Build a graph where each word is a node and each pair is an edge
  word_network <- common_pairs %>%
    graph_from_data_frame()

  set.seed(123)  # Makes the force-directed layout reproducible
  ggraph(word_network, layout = "fr") +
    # Draw lines between connected words, scaled by pair frequency
    geom_edge_link(aes(width = n), color = "gray70", alpha = 0.7) +
    # Add points for each word
    geom_node_point(color = "dodgerblue", size = 4) +
    # Add word labels
    geom_node_text(aes(label = name), repel = TRUE, size = 5) +
    # Clean up the background
    theme_void() +
    # Add titles
    labs(
      title = "Frequently Occurring Word Pairs",
      subtitle = "Each line shows words that often appear together",
      caption = "Only pairs appearing 20+ times shown"
    ) +
    # Adjust appearance
    theme(
      plot.title = element_text(face = "bold", size = 18),
      plot.subtitle = element_text(color = "gray40")
    )
} else {
  message("Not enough common word pairs found to create visualization")
}
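As a next step toward the assistant described at the outset, the same tokenization extends to trigrams, which support a simple back-off scheme: look up the last two typed words, and fall back to the bigram table when no trigram matches. A hedged sketch building on predict_next above (trigrams and predict_backoff are illustrative names, not existing milestone code):

# Count three-word sequences from the same samples
trigrams <- text_samples %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  count(word1, word2, word3, sort = TRUE)

# Suggest the next word from the last two typed words,
# backing off to bigram counts when no trigram matches
predict_backoff <- function(w1, w2, k = 3) {
  hits <- trigrams %>%
    filter(word1 == w1, word2 == w2) %>%
    slice_max(n, n = k, with_ties = FALSE) %>%
    pull(word3)
  if (length(hits) > 0) hits else predict_next(w2, k = k)
}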