# install.packages(c("stringi","dplyr","tibble","knitr","ggplot2","stringr","tidytext","tidyr","viridis"))
library(stringi)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
library(knitr)
library(ggplot2)
library(stringr)
library(tidytext)
library(tidyr)
blogs_path <- "data_raw/en_US.blogs.txt"
news_path <- "data_raw/en_US.news.txt"
twitter_path <- "data_raw/en_US.twitter.txt"
c(
  wd = getwd(),
  blogs_exists = file.exists(blogs_path),
  news_exists = file.exists(news_path),
  twitter_exists = file.exists(twitter_path)
)
## wd blogs_exists news_exists
## "C:/Users/hrras/Desktop" "TRUE" "TRUE"
## twitter_exists
## "TRUE"
blogs <- readLines(blogs_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
news <- readLines(news_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
tw <- readLines(twitter_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
c(blogs_lines = length(blogs), news_lines = length(news), twitter_lines = length(tw))
## blogs_lines news_lines twitter_lines
## 899288 1010206 2360148
set.seed(42)
sample_lines <- function(x, n = 50000) {
  x <- x[nzchar(x)]                    # drop empty lines
  if (length(x) <= n) return(x)        # keep everything if the source is already small
  x[sample.int(length(x), n)]          # otherwise take a uniform random sample of n lines
}
blogs_s <- sample_lines(blogs, n = 50000)
news_s <- sample_lines(news, n = 50000)
twitter_s <- sample_lines(tw, n = 50000)
c(blogs_sample = length(blogs_s), news_sample = length(news_s), twitter_sample = length(twitter_s))
## blogs_sample news_sample twitter_sample
## 50000 50000 50000
basic_stats <- function(x) {
  w <- stri_count_words(x)             # word count per line
  tibble(
    lines = length(x),
    words = sum(w, na.rm = TRUE),
    chars = sum(nchar(x), na.rm = TRUE),
    avg_words_per_line = mean(w, na.rm = TRUE),
    median_words_per_line = median(w, na.rm = TRUE)
  )
}
sum_tbl_full <- bind_rows(
  Blogs = basic_stats(blogs),
  News = basic_stats(news),
  Twitter = basic_stats(tw),
  .id = "source"
)
kable(sum_tbl_full, digits = 2)
| source | lines | words | chars | avg_words_per_line | median_words_per_line |
|---|---|---|---|---|---|
| Blogs | 899288 | 37546806 | 206824505 | 41.75 | 28 |
| News | 1010206 | 34761151 | 203214543 | 34.41 | 32 |
| Twitter | 2360148 | 30096690 | 162096241 | 12.75 | 12 |
The summary statistics highlight clear differences across the three data sources. Blog entries tend to be the longest, followed by news articles, while Twitter messages are substantially shorter on average. These differences motivate treating the sources separately during preprocessing and favor n-gram–based models that can adapt to varying text lengths.
sum_tbl_sample <- bind_rows(
  Blogs = basic_stats(blogs_s),
  News = basic_stats(news_s),
  Twitter = basic_stats(twitter_s),
  .id = "source"
)
kable(sum_tbl_sample, digits = 2)
| source | lines | words | chars | avg_words_per_line | median_words_per_line |
|---|---|---|---|---|---|
| Blogs | 50000 | 2086798 | 11501332 | 41.74 | 28 |
| News | 50000 | 1719623 | 10056678 | 34.39 | 32 |
| Twitter | 50000 | 634791 | 3418013 | 12.70 | 12 |
clean_text <- function(x) {
  x %>%
    str_to_lower() %>%
    str_replace_all("http\\S+|www\\S+", " ") %>%  # strip URLs
    str_replace_all("[^a-z\\s']", " ") %>%        # keep letters, whitespace, apostrophes
    str_squish()                                  # collapse repeated whitespace
}
all_s <- c(blogs_s, news_s, twitter_s) %>% clean_text()
# Compute words per line in this same chunk so df_len is always defined when knitting
df_len <- tibble(words = stringi::stri_count_words(all_s))
ggplot(df_len, aes(x = words)) +
  geom_histogram(
    binwidth = 2,
    fill = viridis::viridis(1),
    color = "white"
  ) +
  coord_cartesian(xlim = c(0, 100)) +
  labs(
    title = "Distribution of words per line (sample)",
    x = "Words per line",
    y = "Count"
  )
tokens <- tibble(text = all_s) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%
  count(word, sort = TRUE)
top20 <- tokens %>% slice_max(n, n = 20)
ggplot(top20, aes(x = reorder(word, n), y = n, fill = n)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis_c(option = "plasma") +
  labs(
    title = "Top 20 words (excluding stopwords)",
    x = "",
    y = "Frequency"
  ) +
  theme(legend.position = "none")
bigrams <- tibble(text = all_s) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("w1","w2"), sep = " ") %>%
  filter(!w1 %in% stop_words$word, !w2 %in% stop_words$word) %>%
  unite(bigram, w1, w2, sep = " ") %>%
  count(bigram, sort = TRUE)
kable(
  head(bigrams, 15),
  caption = "Most Frequent Bigrams in the Sample Corpus"
)
| bigram | n |
|---|---|
| NA NA | 1403 |
| st louis | 484 |
| los angeles | 341 |
| san francisco | 274 |
| amp amp | 262 |
| health care | 233 |
| san diego | 226 |
| happy birthday | 211 |
| social media | 179 |
| ice cream | 163 |
| weeks ago | 157 |
| real estate | 155 |
| white house | 151 |
| supreme court | 145 |
| vice president | 144 |
The "NA NA" entry at the top of the table is an artifact: unnest_tokens() returns NA for lines with fewer than two tokens. The "amp amp" entry is residue of HTML-escaped ampersands (&amp;), a point to revisit during preprocessing. Recomputing the counts with missing and empty tokens filtered out removes the NA artifact:
bigrams <- tibble(text = all_s) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("w1","w2"), sep = " ") %>%
  filter(
    !is.na(w1), !is.na(w2),            # drop NA bigrams produced by too-short lines
    w1 != "", w2 != "",
    !w1 %in% stop_words$word,
    !w2 %in% stop_words$word
  ) %>%
  unite(bigram, w1, w2, sep = " ") %>%
  count(bigram, sort = TRUE)
Aside from those artifacts, the most frequent bigrams are dominated by place names and fixed expressions such as "st louis", "happy birthday", and "health care". These bigrams provide useful context for predicting the next word when combined with unigram and trigram models through a backoff strategy.
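To make the trigram side concrete, here is one way that table could be assembled with the same tidytext pipeline (a sketch only; the trigrams object does not appear elsewhere in this report, and stopwords are deliberately kept because the predictor needs them as context):
# Sketch: trigram counts via the same unnest_tokens() pipeline as above.
# Stopwords are kept on purpose: the predictor needs them as context.
trigrams <- tibble(text = all_s) %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%          # drop NAs from lines shorter than three tokens
  count(trigram, sort = TRUE)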
The exploratory analysis reveals clear structural differences among the three text sources. Twitter messages are notably shorter than blog posts and news articles, consistent with the platform's character limit. The distribution of words per line is highly right-skewed: most lines are short, while a small number contain many words. This skew means the language model must efficiently handle the short contexts that dominate the data.
The prediction approach will rely on probabilistic n-gram language models constructed from the cleaned text corpus. Separate frequency tables for unigrams, bigrams, and trigrams will be generated. When predicting the next word, the model will first attempt to match the longest available context and will progressively fall back to shorter contexts if no suitable match is found. Candidate predictions will be ranked based on their observed frequencies within the corpus.
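As a minimal sketch of that backoff lookup (hypothetical: predict_next() and the tables tri, bi, and uni are illustrative, assuming columns context, word, and n for the bigram/trigram tables and word and n for the unigrams):
# Frequency-ranked backoff, longest context first (sketch, not the final model).
predict_next <- function(input, tri, bi, uni, k = 3) {
  words <- str_split(clean_text(input), " ")[[1]]
  n_words <- length(words)
  if (n_words >= 2) {                  # try the trigram table: last two words as context
    ctx <- paste(words[n_words - 1], words[n_words])
    hits <- tri %>% filter(context == ctx) %>% slice_max(n, n = k)
    if (nrow(hits) > 0) return(hits$word)
  }
  if (n_words >= 1) {                  # back off to bigrams: last word as context
    hits <- bi %>% filter(context == words[n_words]) %>% slice_max(n, n = k)
    if (nrow(hits) > 0) return(hits$word)
  }
  uni %>% slice_max(n, n = k) %>% pull(word)  # final fallback: most frequent unigrams
}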
The Shiny application will be designed as a lightweight interface that allows users to input a partial sentence and receive suggested next-word predictions. The app will present the top candidate words in a clear and concise format. Emphasis will be placed on responsiveness and usability, with all prediction logic relying on precomputed language model components to ensure fast performance.
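A minimal sketch of that interface (hypothetical; it assumes predict_next() and the precomputed tri, bi, and uni tables are loaded in the app environment):
library(shiny)
# Sketch of the planned app: one text box, top-ranked suggestions below it.
ui <- fluidPage(
  titlePanel("Next-Word Prediction (sketch)"),
  textInput("phrase", "Type a partial sentence:"),
  tableOutput("suggestions")
)
server <- function(input, output) {
  output$suggestions <- renderTable({
    req(nzchar(input$phrase))          # wait for non-empty input
    tibble(prediction = predict_next(input$phrase, tri, bi, uni))
  })
}
shinyApp(ui, server)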
Future steps include refining text preprocessing decisions, expanding the evaluation of the prediction model using held-out data, and integrating the finalized model into the Shiny application. Additional tuning may be performed to balance model accuracy and computational efficiency.