The goal of this capstone project is to build a predictive text model. Given a phrase typed by a user, the model predicts the most likely next word. The final product will be deployed as a Shiny application.
The data consists of three English text files from blogs, news, and Twitter. These files represent different styles of written language.
set.seed(123)
data_dir <- file.path("data", "en_US")
files <- tibble(
source = c("Blogs", "News", "Twitter"),
path = file.path(data_dir, c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))
)
sample_per_file <- 10000
read_sample <- function(path, n) {
x <- readLines(path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
x <- iconv(x, from = "", to = "UTF-8", sub = " ")
x <- x[nchar(x) > 0]
if (length(x) > n) sample(x, n) else x
}
raw_by_source <- map(files$path, read_sample, n = sample_per_file)
data_summary <- tibble(
source = files$source,
sampled_lines = map_int(raw_by_source, length),
total_characters = map_dbl(raw_by_source, ~ sum(nchar(.x))),
mean_characters_per_line = map_dbl(raw_by_source, ~ mean(nchar(.x)))
)
data_summary
## # A tibble: 3 × 4
## source sampled_lines total_characters mean_characters_per_line
## <chr> <int> <dbl> <dbl>
## 1 Blogs 10000 2297303 230.
## 2 News 10000 2026898 203.
## 3 Twitter 10000 682311 68.2
The text is converted to lowercase, URLs are removed, non-letter symbols are replaced with spaces, and repeated spaces are trimmed.
clean_text <- function(x) {
x %>%
str_to_lower() %>%
str_replace_all("https?://\\S+|www\\.\\S+", " ") %>%
str_replace_all("[^a-z'\\s]", " ") %>%
str_replace_all("\\s+", " ") %>%
str_squish()
}
text_df <- tibble(
source = rep(files$source, map_int(raw_by_source, length)),
text = flatten_chr(raw_by_source)
) %>%
mutate(text = clean_text(text)) %>%
filter(!is.na(text), nchar(text) > 0)
head(text_df)
## # A tibble: 6 × 2
## source text
## <chr> <chr>
## 1 Blogs the bruschetta however missed the mark instead of manageable two bite …
## 2 Blogs walden pond mt rainier big sur everglades and so forth
## 3 Blogs despite laws banning cell phones while driving and increased awareness…
## 4 Blogs ghosts and goblins
## 5 Blogs now i can write in specific post information for each day of the week …
## 6 Blogs but trying to pin photos to muslin walls would be a bit too tricky
unigrams <- text_df %>%
unnest_tokens(word, text) %>%
count(word, sort = TRUE)
top_unigrams <- unigrams %>% slice_max(n, n = 20)
top_unigrams
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 the 44123
## 2 to 24344
## 3 and 23258
## 4 a 21467
## 5 of 18793
## 6 in 14943
## 7 i 14059
## 8 that 9701
## 9 for 9263
## 10 is 9010
## 11 it 8406
## 12 on 6968
## 13 you 6701
## 14 with 6541
## 15 was 5879
## 16 at 4910
## 17 this 4753
## 18 be 4736
## 19 my 4665
## 20 as 4623
top_unigrams %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Top 20 Most Frequent Words",
x = "Word",
y = "Frequency"
)
bigrams <- text_df %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
count(bigram, sort = TRUE)
top_bigrams <- bigrams %>% slice_max(n, n = 20)
top_bigrams
## # A tibble: 20 × 2
## bigram n
## <chr> <int>
## 1 of the 4014
## 2 in the 3839
## 3 to the 2002
## 4 on the 1764
## 5 for the 1701
## 6 to be 1418
## 7 at the 1246
## 8 and the 1223
## 9 in a 1096
## 10 with the 1007
## 11 is a 915
## 12 it was 890
## 13 from the 845
## 14 of a 820
## 15 and i 797
## 16 i was 790
## 17 for a 778
## 18 with a 771
## 19 i have 735
## 20 it is 728
top_bigrams %>%
mutate(bigram = reorder(bigram, n)) %>%
ggplot(aes(x = bigram, y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Top 20 Most Frequent Bigrams",
x = "Bigram",
y = "Frequency"
)
trigrams <- text_df %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
count(trigram, sort = TRUE)
top_trigrams <- trigrams %>% slice_max(n, n = 20)
top_trigrams
## # A tibble: 20 × 2
## trigram n
## <chr> <int>
## 1 <NA> 1109
## 2 one of the 312
## 3 a lot of 259
## 4 going to be 144
## 5 the end of 144
## 6 it was a 143
## 7 as well as 138
## 8 the u s 136
## 9 to be a 133
## 10 some of the 132
## 11 be able to 125
## 12 out of the 122
## 13 i want to 117
## 14 this is a 116
## 15 part of the 115
## 16 i don t 112
## 17 the rest of 110
## 18 i have to 104
## 19 a couple of 103
## 20 the first time 96
top_trigrams %>%
mutate(trigram = reorder(trigram, n)) %>%
ggplot(aes(x = trigram, y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Top 20 Most Frequent Trigrams",
x = "Trigram",
y = "Frequency"
)
The prediction model will use n-gram frequencies with a backoff strategy. When a user enters a phrase, the algorithm first uses the last three words to search for a matching 4-gram prefix. If no match is found, it backs off to the last two words, then to the last one word. If all prefix searches fail, the model returns the most frequent unigram.
The final product will be a Shiny app where users type a phrase and receive the predicted next word.