1. Project Goal

The goal of this capstone project is to build a predictive text model. Given a phrase typed by a user, the model predicts the most likely next word. The final product will be deployed as a Shiny application.

2. Data Source

The data consists of three English text files from blogs, news, and Twitter. These files represent different styles of written language.

set.seed(123)

data_dir <- file.path("data", "en_US")
files <- tibble(
  source = c("Blogs", "News", "Twitter"),
  path = file.path(data_dir, c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))
)

sample_per_file <- 10000

read_sample <- function(path, n) {
  x <- readLines(path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
  x <- iconv(x, from = "", to = "UTF-8", sub = " ")
  x <- x[nchar(x) > 0]
  if (length(x) > n) sample(x, n) else x
}

raw_by_source <- map(files$path, read_sample, n = sample_per_file)

3. Basic Data Summary

data_summary <- tibble(
  source = files$source,
  sampled_lines = map_int(raw_by_source, length),
  total_characters = map_dbl(raw_by_source, ~ sum(nchar(.x))),
  mean_characters_per_line = map_dbl(raw_by_source, ~ mean(nchar(.x)))
)

data_summary
## # A tibble: 3 × 4
##   source  sampled_lines total_characters mean_characters_per_line
##   <chr>           <int>            <dbl>                    <dbl>
## 1 Blogs           10000          2297303                    230. 
## 2 News            10000          2026898                    203. 
## 3 Twitter         10000           682311                     68.2

4. Text Cleaning

The text is converted to lowercase, URLs are removed, non-letter symbols are replaced with spaces, and repeated spaces are trimmed.

clean_text <- function(x) {
  x %>%
    str_to_lower() %>%
    str_replace_all("https?://\\S+|www\\.\\S+", " ") %>%
    str_replace_all("[^a-z'\\s]", " ") %>%
    str_replace_all("\\s+", " ") %>%
    str_squish()
}

text_df <- tibble(
  source = rep(files$source, map_int(raw_by_source, length)),
  text = flatten_chr(raw_by_source)
) %>%
  mutate(text = clean_text(text)) %>%
  filter(!is.na(text), nchar(text) > 0)

head(text_df)
## # A tibble: 6 × 2
##   source text                                                                   
##   <chr>  <chr>                                                                  
## 1 Blogs  the bruschetta however missed the mark instead of manageable two bite …
## 2 Blogs  walden pond mt rainier big sur everglades and so forth                 
## 3 Blogs  despite laws banning cell phones while driving and increased awareness…
## 4 Blogs  ghosts and goblins                                                     
## 5 Blogs  now i can write in specific post information for each day of the week …
## 6 Blogs  but trying to pin photos to muslin walls would be a bit too tricky

5. Most Frequent Words

unigrams <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

top_unigrams <- unigrams %>% slice_max(n, n = 20)

top_unigrams
## # A tibble: 20 × 2
##    word      n
##    <chr> <int>
##  1 the   44123
##  2 to    24344
##  3 and   23258
##  4 a     21467
##  5 of    18793
##  6 in    14943
##  7 i     14059
##  8 that   9701
##  9 for    9263
## 10 is     9010
## 11 it     8406
## 12 on     6968
## 13 you    6701
## 14 with   6541
## 15 was    5879
## 16 at     4910
## 17 this   4753
## 18 be     4736
## 19 my     4665
## 20 as     4623
top_unigrams %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Words",
    x = "Word",
    y = "Frequency"
  )

6. Most Frequent Bigrams

bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)

top_bigrams <- bigrams %>% slice_max(n, n = 20)

top_bigrams
## # A tibble: 20 × 2
##    bigram       n
##    <chr>    <int>
##  1 of the    4014
##  2 in the    3839
##  3 to the    2002
##  4 on the    1764
##  5 for the   1701
##  6 to be     1418
##  7 at the    1246
##  8 and the   1223
##  9 in a      1096
## 10 with the  1007
## 11 is a       915
## 12 it was     890
## 13 from the   845
## 14 of a       820
## 15 and i      797
## 16 i was      790
## 17 for a      778
## 18 with a     771
## 19 i have     735
## 20 it is      728
top_bigrams %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(x = bigram, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Bigrams",
    x = "Bigram",
    y = "Frequency"
  )

7. Most Frequent Trigrams

trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  count(trigram, sort = TRUE)

top_trigrams <- trigrams %>% slice_max(n, n = 20)

top_trigrams
## # A tibble: 20 × 2
##    trigram            n
##    <chr>          <int>
##  1 <NA>            1109
##  2 one of the       312
##  3 a lot of         259
##  4 going to be      144
##  5 the end of       144
##  6 it was a         143
##  7 as well as       138
##  8 the u s          136
##  9 to be a          133
## 10 some of the      132
## 11 be able to       125
## 12 out of the       122
## 13 i want to        117
## 14 this is a        116
## 15 part of the      115
## 16 i don t          112
## 17 the rest of      110
## 18 i have to        104
## 19 a couple of      103
## 20 the first time    96
top_trigrams %>%
  mutate(trigram = reorder(trigram, n)) %>%
  ggplot(aes(x = trigram, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Trigrams",
    x = "Trigram",
    y = "Frequency"
  )

8. Modeling Plan

The prediction model will use n-gram frequencies with a backoff strategy. When a user enters a phrase, the algorithm first uses the last three words to search for a matching 4-gram prefix. If no match is found, it backs off to the last two words, then to the last one word. If all prefix searches fail, the model returns the most frequent unigram.

The final product will be a Shiny app where users type a phrase and receive the predicted next word.