Introduction

This project develops a predictive typing assistant using text data from blogs, news, and Twitter. Our solution will help users type faster by predicting likely next words, similar to smartphone keyboard suggestions.
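
To make the idea concrete before diving into the data, here is a minimal sketch of the kind of lookup the assistant performs. The `bigram_counts` table, its counts, and the example words are hypothetical placeholders assuming a simple bigram model; the real model is built from the corpora analyzed below.

library(dplyr)

# Hypothetical bigram counts; a real model mines these from the corpora below
bigram_counts <- tribble(
  ~word1,  ~word2,     ~n,
  "happy", "birthday", 520,
  "happy", "hour",     310,
  "happy", "new",      290
)

# Given the last word typed, suggest the most frequent continuations
bigram_counts %>%
  filter(word1 == "happy") %>%
  arrange(desc(n)) %>%
  pull(word2)
## [1] "birthday" "hour"     "new"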

Data Overview

We analyzed three English text sources:

# Packages used throughout this report
library(tidyverse)    # dplyr, purrr, stringr, tidyr, forcats, ggplot2
library(tidytext)     # unnest_tokens(), stop_words
library(knitr)        # kable()
library(kableExtra)   # kable_styling(), add_header_above()
library(igraph)       # graph_from_data_frame()
library(ggraph)       # network plots

setwd("/Users/xav_21/Desktop/Data_Science_R/coursera/final/en_US/")
files <- c("en_US.blogs.txt", 
           "en_US.news.txt", 
           "en_US.twitter.txt")

# Read each file once and compute all summary statistics from it
stats <- map_df(seq_along(files), function(i) {
  lines <- readLines(files[i], warn = FALSE)
  tibble(
    Source          = c("Blogs", "News", "Twitter")[i],
    `Size (MB)`     = round(file.size(files[i]) / (1024^2), 1),
    `Entries`       = length(lines),
    `Total Words`   = sum(str_count(lines, "\\S+")),
    `Longest Entry` = format(max(nchar(lines)), big.mark = ",")
  )
})

stats %>%
  kable(align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  add_header_above(c("Text Source Summary" = 5))
Text Source Summary

Source    Size (MB)    Entries    Total Words   Longest Entry
Blogs         200.4    899,288     37,334,131          40,833
News          196.3  1,010,242     34,372,530          11,384
Twitter       159.4  2,360,148     30,373,543             140

Key Observations:

  • Twitter has the most entries but the shortest ones (at most 140 characters, the tweet limit when this data was collected)
  • Blogs contain the longest entries, up to 40,833 characters
  • News articles are the most consistent in length

Text Analysis Highlights

Word Frequency Patterns

set.seed(123)
combined <- map_df(files, ~{
  # First read all lines
  all_lines <- readLines(.x, warn = FALSE)
  # Then sample from the lines
  sampled_lines <- sample(all_lines, size = min(5000, length(all_lines)))
  # Create tibble
  tibble(
    text = sampled_lines,
    source = str_replace(.x, ".*en_US\\.(.*?)\\.txt", "\\1")  # "blogs", "news", "twitter"
  )
}) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
# Top words plot
p1 <- combined %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "#3498db") +
  coord_flip() +
  labs(title = "Top 20 Words", x = "", y = "Count")

# Display the plot
p1

Common Phrases

# Step 1: Prepare the text data
set.seed(123)  # Make the sampling reproducible
text_samples <- map_df(files, ~{
  # Read each file in full, then sample 5,000 of its lines
  readLines(.x, warn = FALSE, encoding = "UTF-8") %>%
    sample(size = min(5000, length(.))) %>%
    tibble(
      text = .,
      source = str_replace(.x, ".*en_US\\.(.*?)\\.txt", "\\1")
    )
})

# Step 2: Find common word pairs
common_pairs <- text_samples %>%
  # Extract all 2-word phrases
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Split into separate words
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # Remove common words and missing values
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !is.na(word1), 
    !is.na(word2)
  ) %>%
  # Count how often each pair appears
  count(word1, word2, sort = TRUE) %>%
  # Keep only pairs appearing at least 20 times
  filter(n >= 20)

# Step 3: Visualize the network
if (nrow(common_pairs) > 0) {
  # Create connections between words
  word_network <- common_pairs %>%
    graph_from_data_frame()
  
  # Create the visualization
  set.seed(123) # Makes layout consistent
  ggraph(word_network, layout = "fr") + 
    # Draw lines between connected words
    geom_edge_link(aes(width = n), color = "gray70", alpha = 0.7) +
    # Add points for each word
    geom_node_point(color = "dodgerblue", size = 4) +
    # Add word labels
    geom_node_text(aes(label = name), repel = TRUE, size = 5) +
    # Clean up the background
    theme_void() +
    # Add titles
    labs(
      title = "Frequently Occurring Word Pairs",
      subtitle = "Each line shows words that often appear together",
      caption = "Only pairs appearing 20+ times shown"
    ) +
    # Adjust appearance
    theme(
      plot.title = element_text(face = "bold", size = 18),
      plot.subtitle = element_text(color = "gray40")
    )
} else {
  message("Not enough common word pairs found to create visualization")
}
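
The same bigram counts can already drive next-word suggestions. Below is a hedged sketch (not the final model) that reuses the `common_pairs` table computed above; `suggest_from_pairs` is a name introduced here, and whether a given query word appears at all depends on the random sample.

# Sketch: query the bigram counts for next-word suggestions
suggest_from_pairs <- function(w, k = 3) {
  common_pairs %>%
    filter(word1 == w) %>%
    slice_max(n, n = k, with_ties = FALSE) %>%
    pull(word2)
}

# Example query; results depend on the sampled lines
suggest_from_pairs("happy")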

Key Findings

Language Patterns:

  • “Love” appears about 4.1x more often than “hate” in the Twitter data
  • One tweet about a computer winning at chess but losing at kickboxing appears verbatim 3 times
  • Roughly 50% of all word occurrences are covered by only ~100 distinct words (see the sketch below)
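
These figures come from straightforward corpus queries. Here is a hedged sketch of how they can be reproduced; `tweets`, `word_freq`, and `cum_share` are names introduced for illustration, and exact values vary with the random sample.

# Love/hate ratio, counted over the full Twitter file
tweets <- readLines("en_US.twitter.txt", warn = FALSE)
sum(grepl("love", tweets)) / sum(grepl("hate", tweets))

# Coverage: how many distinct words account for ~50% of word occurrences?
# (computed on the sampled text before stop-word removal)
word_freq <- text_samples %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(cum_share = cumsum(n) / sum(n))
sum(word_freq$cum_share <= 0.5)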

Source Differences:

  • Blogs use more personal pronouns (I, you)
  • News contains more proper nouns
  • Twitter leans on abbreviations (u, rt); the sketch below shows how such per-source rates can be compared
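
As a hedged sketch of how these per-source differences can be quantified, the `text_samples` table from the bigram step can be re-tokenized without stop-word removal; the marker words chosen here are illustrative, not an exhaustive list.

# Relative frequency (per 10,000 tokens) of a few marker words, by source
text_samples %>%
  unnest_tokens(word, text) %>%   # lowercases, so "I" becomes "i"
  count(source, word) %>%
  group_by(source) %>%
  mutate(rate = n / sum(n) * 1e4) %>%
  ungroup() %>%
  filter(word %in% c("i", "you", "u", "rt")) %>%
  arrange(word, desc(rate))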