Introduction

This project develops a predictive typing assistant using text data from blogs, news, and Twitter. Our solution will help users type faster by predicting likely next words, similar to smartphone keyboard suggestions.
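
To make the idea concrete before diving into the data, here is a minimal sketch of the kind of lookup the assistant performs. The `bigram_counts` table, its counts, and the example words are hypothetical placeholders assuming a simple bigram model; the real model is built from the corpora analyzed below.

library(dplyr)

# Hypothetical bigram counts; a real model mines these from the corpora below
bigram_counts <- tribble(
  ~word1,  ~word2,     ~n,
  "happy", "birthday", 520,
  "happy", "hour",     310,
  "happy", "new",      290
)

# Given the last word typed, suggest the most frequent continuations
bigram_counts %>%
  filter(word1 == "happy") %>%
  arrange(desc(n)) %>%
  pull(word2)
## [1] "birthday" "hour"     "new"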

Data Overview

We analyzed three English text sources:

# Packages used throughout this report
library(tidyverse)    # dplyr, purrr, stringr, tidyr, forcats, ggplot2
library(tidytext)     # unnest_tokens(), stop_words
library(knitr)        # kable()
library(kableExtra)   # kable_styling(), add_header_above()
library(igraph)       # graph_from_data_frame()
library(ggraph)       # network plots

setwd("/Users/xav_21/Desktop/Data_Science_R/coursera/final/en_US/")
files <- c("en_US.blogs.txt", 
           "en_US.news.txt", 
           "en_US.twitter.txt")

# Read each file once and compute all summary statistics from it
stats <- map_df(seq_along(files), function(i) {
  lines <- readLines(files[i], warn = FALSE)
  tibble(
    Source          = c("Blogs", "News", "Twitter")[i],
    `Size (MB)`     = round(file.size(files[i]) / (1024^2), 1),
    `Entries`       = length(lines),
    `Total Words`   = sum(str_count(lines, "\\S+")),
    `Longest Entry` = format(max(nchar(lines)), big.mark = ",")
  )
})

stats %>%
  kable(align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  add_header_above(c("Text Source Summary" = 5))
Text Source Summary

Source    Size (MB)    Entries    Total Words   Longest Entry
Blogs         200.4    899,288     37,334,131          40,833
News          196.3  1,010,242     34,372,530          11,384
Twitter       159.4  2,360,148     30,373,543             140

Key Observations:

  • Twitter has the most entries but the shortest ones (at most 140 characters, the tweet limit when this data was collected)
  • Blogs contain the longest entries, up to 40,833 characters
  • News articles are the most consistent in length

Text Analysis Highlights

Word Frequency Patterns

set.seed(123)
combined <- map_df(files, ~{
  # First read all lines
  all_lines <- readLines(.x, warn = FALSE)
  # Then sample from the lines
  sampled_lines <- sample(all_lines, size = min(5000, length(all_lines)))
  # Create tibble
  tibble(
    text = sampled_lines,
    source = str_replace(.x, ".*en_US\\.(.*?)\\.txt", "\\1")  # "blogs", "news", "twitter"
  )
}) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
# Top words plot
p1 <- combined %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "#3498db") +
  coord_flip() +
  labs(title = "Top 20 Words", x = "", y = "Count")

# Display the plot
p1

Common Phrases

# Step 1: Prepare the text data
set.seed(123)  # Make the sampling reproducible
text_samples <- map_df(files, ~{
  # Read each file in full, then sample 5,000 of its lines
  readLines(.x, warn = FALSE, encoding = "UTF-8") %>%
    sample(size = min(5000, length(.))) %>%
    tibble(
      text = .,
      source = str_replace(.x, ".*en_US\\.(.*?)\\.txt", "\\1")
    )
})

# Step 2: Find common word pairs
common_pairs <- text_samples %>%
  # Extract all 2-word phrases
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Split into separate words
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # Remove common words and missing values
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !is.na(word1), 
    !is.na(word2)
  ) %>%
  # Count how often each pair appears
  count(word1, word2, sort = TRUE) %>%
  # Keep only pairs appearing at least 20 times
  filter(n >= 20)

# Step 3: Visualize the network
if (nrow(common_pairs) > 0) {
  # Create connections between words
  word_network <- common_pairs %>%
    graph_from_data_frame()
  
  # Create the visualization
  set.seed(123) # Makes layout consistent
  ggraph(word_network, layout = "fr") + 
    # Draw lines between connected words
    geom_edge_link(aes(width = n), color = "gray70", alpha = 0.7) +
    # Add points for each word
    geom_node_point(color = "dodgerblue", size = 4) +
    # Add word labels
    geom_node_text(aes(label = name), repel = TRUE, size = 5) +
    # Clean up the background
    theme_void() +
    # Add titles
    labs(
      title = "Frequently Occurring Word Pairs",
      subtitle = "Each line shows words that often appear together",
      caption = "Only pairs appearing 20+ times shown"
    ) +
    # Adjust appearance
    theme(
      plot.title = element_text(face = "bold", size = 18),
      plot.subtitle = element_text(color = "gray40")
    )
} else {
  message("Not enough common word pairs found to create visualization")
}
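
The same bigram counts can already drive next-word suggestions. Below is a hedged sketch (not the final model) that reuses the `common_pairs` table computed above; `suggest_from_pairs` is a name introduced here, and whether a given query word appears at all depends on the random sample.

# Sketch: query the bigram counts for next-word suggestions
suggest_from_pairs <- function(w, k = 3) {
  common_pairs %>%
    filter(word1 == w) %>%
    slice_max(n, n = k, with_ties = FALSE) %>%
    pull(word2)
}

# Example query; results depend on the sampled lines
suggest_from_pairs("happy")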

Key Findings

Language Patterns:

  • “Love” appears about 4.1x more often than “hate” in the Twitter data
  • One tweet about a computer winning at chess but losing at kickboxing appears verbatim 3 times
  • Roughly 50% of all word occurrences are covered by only ~100 distinct words (see the sketch below)
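
These figures come from straightforward corpus queries. Here is a hedged sketch of how they can be reproduced; `tweets`, `word_freq`, and `cum_share` are names introduced for illustration, and exact values vary with the random sample.

# Love/hate ratio, counted over the full Twitter file
tweets <- readLines("en_US.twitter.txt", warn = FALSE)
sum(grepl("love", tweets)) / sum(grepl("hate", tweets))

# Coverage: how many distinct words account for ~50% of word occurrences?
# (computed on the sampled text before stop-word removal)
word_freq <- text_samples %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(cum_share = cumsum(n) / sum(n))
sum(word_freq$cum_share <= 0.5)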

Source Differences:

  • Blogs use more personal pronouns (I, you)
  • News contains more proper nouns
  • Twitter leans on abbreviations (u, rt); the sketch below shows how such per-source rates can be compared
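
As a hedged sketch of how these per-source differences can be quantified, the `text_samples` table from the bigram step can be re-tokenized without stop-word removal; the marker words chosen here are illustrative, not an exhaustive list.

# Relative frequency (per 10,000 tokens) of a few marker words, by source
text_samples %>%
  unnest_tokens(word, text) %>%   # lowercases, so "I" becomes "i"
  count(source, word) %>%
  group_by(source) %>%
  mutate(rate = n / sum(n) * 1e4) %>%
  ungroup() %>%
  filter(word %in% c("i", "you", "u", "rt")) %>%
  arrange(word, desc(rate))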