# install.packages(c("stringi","dplyr","tibble","knitr","ggplot2","stringr","tidytext","tidyr"))

library(stringi)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)
library(knitr)
library(ggplot2)
library(stringr)
library(tidytext)
library(tidyr)
blogs_path   <- "data_raw/en_US.blogs.txt"
news_path    <- "data_raw/en_US.news.txt"
twitter_path <- "data_raw/en_US.twitter.txt"

c(
  wd = getwd(),
  blogs_exists = file.exists(blogs_path),
  news_exists = file.exists(news_path),
  twitter_exists = file.exists(twitter_path)
)
##                       wd             blogs_exists              news_exists 
## "C:/Users/hrras/Desktop"                   "TRUE"                   "TRUE" 
##           twitter_exists 
##                   "TRUE"
blogs <- readLines(blogs_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
news  <- readLines(news_path,  encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
tw    <- readLines(twitter_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)

c(blogs_lines = length(blogs), news_lines = length(news), twitter_lines = length(tw))
##   blogs_lines    news_lines twitter_lines 
##        899288       1010206       2360148
set.seed(42)

sample_lines <- function(x, n = 50000) {
  x <- x[nzchar(x)]                     # drop empty lines before sampling
  if (length(x) <= n) return(x)
  x[sample.int(length(x), n)]           # uniform sample without replacement
}

blogs_s   <- sample_lines(blogs, n = 50000)
news_s    <- sample_lines(news,  n = 50000)
twitter_s <- sample_lines(tw,    n = 50000)

c(blogs_sample = length(blogs_s), news_sample = length(news_s), twitter_sample = length(twitter_s))
##   blogs_sample    news_sample twitter_sample 
##          50000          50000          50000

Basic Summaries

basic_stats <- function(x) {
  w <- stri_count_words(x)
  tibble(
    lines = length(x),
    words = sum(w, na.rm = TRUE),
    chars = sum(nchar(x), na.rm = TRUE),
    avg_words_per_line = mean(w, na.rm = TRUE),
    median_words_per_line = median(w, na.rm = TRUE)
  )
}

sum_tbl_full <- bind_rows(
  Blogs   = basic_stats(blogs),
  News    = basic_stats(news),
  Twitter = basic_stats(tw),
  .id = "source"
)

kable(sum_tbl_full, digits = 2)
source    lines     words      chars        avg_words_per_line   median_words_per_line
Blogs     899288    37546806   206824505    41.75                28
News      1010206   34761151   203214543    34.41                32
Twitter   2360148   30096690   162096241    12.75                12

The summary statistics highlight clear differences across the three sources. Blog entries are the longest on average, followed by news articles, while Twitter messages are substantially shorter. These differences motivate treating the sources separately during preprocessing, and they favor n-gram models, whose fixed-length contexts work equally well on short and long lines.

sum_tbl_sample <- bind_rows(
  Blogs   = basic_stats(blogs_s),
  News    = basic_stats(news_s),
  Twitter = basic_stats(twitter_s),
  .id = "source"
)

kable(sum_tbl_sample, digits = 2)
source    lines   words     chars      avg_words_per_line   median_words_per_line
Blogs     50000   2086798   11501332   41.74                28
News      50000   1719623   10056678   34.39                32
Twitter   50000   634791    3418013    12.70                12
clean_text <- function(x) {
  x %>%
    str_to_lower() %>%                              # lowercase everything
    str_replace_all("http\\S+|www\\S+", " ") %>%    # strip URLs
    str_replace_all("[^a-z\\s']", " ") %>%          # keep letters, spaces, apostrophes
    str_squish()                                    # collapse repeated whitespace
}

all_s <- c(blogs_s, news_s, twitter_s) %>% clean_text()

Plots

# Build df_len in this chunk so it is always defined when the document is knit
df_len <- tibble(words = stringi::stri_count_words(all_s))

ggplot(df_len, aes(x = words)) +
  geom_histogram(
    binwidth = 2,
    fill = viridis::viridis(1),
    color = "white"
  ) +
  coord_cartesian(xlim = c(0, 100)) +
  labs(
    title = "Distribution of words per line (sample)",
    x = "Words per line",
    y = "Count"
  )

tokens <- tibble(text = all_s) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%
  count(word, sort = TRUE)

top20 <- tokens %>% slice_max(n, n = 20)

ggplot(top20, aes(x = reorder(word, n), y = n, fill = n)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis_c(option = "plasma") +
  labs(
    title = "Top 20 words (excluding stopwords)",
    x = "",
    y = "Frequency"
  ) +
  theme(legend.position = "none")

bigrams <- tibble(text = all_s) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("w1","w2"), sep = " ") %>%
  filter(!w1 %in% stop_words$word, !w2 %in% stop_words$word) %>%
  unite(bigram, w1, w2, sep = " ") %>%
  count(bigram, sort = TRUE)

kable(
  head(bigrams, 15),
  caption = "Most Frequent Bigrams in the Sample Corpus"
)
Most Frequent Bigrams in the Sample Corpus

bigram               n
NA NA             1403
st louis           484
los angeles        341
san francisco      274
amp amp            262
health care        233
san diego          226
happy birthday     211
social media       179
ice cream          163
weeks ago          157
real estate        155
white house        151
supreme court      145
vice president     144
The "NA NA" row comes from lines with fewer than two tokens, for which unnest_tokens() returns a missing bigram. The pipeline is therefore re-run with explicit filters for missing and empty tokens:

bigrams <- tibble(text = all_s) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("w1","w2"), sep = " ") %>%
  filter(
    !is.na(w1), !is.na(w2),
    w1 != "", w2 != "",
    !w1 %in% stop_words$word,
    !w2 %in% stop_words$word
  ) %>%
  unite(bigram, w1, w2, sep = " ") %>%
  count(bigram, sort = TRUE)

The most frequent bigrams are dominated by place names and fixed expressions common in everyday English; the "amp amp" pair is residue of HTML-escaped ampersands (&amp;) and should be stripped during cleaning. These bigrams provide useful context for predicting the next word when combined with unigram and trigram frequency tables in a backoff strategy.
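As a sketch of how those frequency tables could be assembled from the sample (reusing all_s from above; ngram_counts is a hypothetical helper, not part of the final model):

# Hypothetical helper: count n-grams of a given order in a character vector.
ngram_counts <- function(lines, n) {
  tibble(text = lines) %>%
    unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
    filter(!is.na(ngram)) %>%        # drop lines too short to form an n-gram
    count(ngram, sort = TRUE)
}

uni_tbl <- ngram_counts(all_s, 1)
bi_tbl  <- ngram_counts(all_s, 2)
tri_tbl <- ngram_counts(all_s, 3)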

Interesting Findings

The exploratory analysis reveals clear structural differences among the three text sources. Twitter messages are notably shorter than blog posts and news articles, consistent with the platform's character limit. The distribution of words per line is highly right-skewed: most lines are short, while a small number are very long. Because short contexts will dominate the input the predictor sees, the model must handle them efficiently.

Plan for Prediction Algorithm

The prediction approach will rely on probabilistic n-gram language models constructed from the cleaned text corpus. Separate frequency tables for unigrams, bigrams, and trigrams will be generated. When predicting the next word, the model will first attempt to match the longest available context and will progressively fall back to shorter contexts if no suitable match is found. Candidate predictions will be ranked based on their observed frequencies within the corpus.
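A minimal sketch of that backoff lookup, assuming the uni_tbl, bi_tbl, and tri_tbl tables from the earlier sketch; predict_next_word is a hypothetical name, and the final version would rank by smoothed scores rather than raw counts:

# Hypothetical backoff predictor: try the trigram table first, then back off.
predict_next_word <- function(context, k = 3) {
  ctx <- str_split(clean_text(context), " ")[[1]]
  m   <- length(ctx)

  # Return up to k most frequent continuations of a given prefix.
  lookup <- function(tbl, prefix) {
    tbl %>%
      filter(str_starts(ngram, fixed(paste0(prefix, " ")))) %>%
      slice_max(n, n = k, with_ties = FALSE) %>%
      mutate(nxt = word(ngram, -1)) %>%
      pull(nxt)
  }

  out <- character(0)
  if (m >= 2) out <- lookup(tri_tbl, paste(ctx[m - 1], ctx[m]))
  if (length(out) == 0 && m >= 1) out <- lookup(bi_tbl, ctx[m])
  if (length(out) == 0) out <- head(uni_tbl$ngram, k)   # last resort: top unigrams
  out
}

predict_next_word("a cup of")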

Plan for Shiny App

The Shiny application will be designed as a lightweight interface that allows users to input a partial sentence and receive suggested next-word predictions. The app will present the top candidate words in a clear and concise format. Emphasis will be placed on responsiveness and usability, with all prediction logic relying on precomputed language model components to ensure fast performance.
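A minimal skeleton of that interface, assuming the predict_next_word sketch above and precomputed tables saved to an .rds file (both names are illustrative):

library(shiny)

# In the deployed app the tables would be loaded once at startup, e.g.:
# tri_tbl <- readRDS("tri_tbl.rds")   # illustrative file name

ui <- fluidPage(
  titlePanel("Next-Word Prediction"),
  textInput("phrase", "Type a partial sentence:"),
  tableOutput("suggestions")
)

server <- function(input, output, session) {
  output$suggestions <- renderTable({
    req(nzchar(input$phrase))
    tibble(candidate = predict_next_word(input$phrase, k = 3))
  })
}

shinyApp(ui, server)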

Next Steps

Future steps include refining text preprocessing decisions, expanding the evaluation of the prediction model using held-out data, and integrating the finalized model into the Shiny application. Additional tuning may be performed to balance model accuracy and computational efficiency.
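One possible shape for that evaluation, sketched under the assumption that the held-out lines are disjoint from the training sample (here they are drawn from the same file, so the resulting number would be optimistic):

# Sketch: top-3 accuracy of the backoff predictor on held-out trigrams.
set.seed(99)
heldout <- clean_text(sample_lines(blogs, n = 1000))

test_cases <- tibble(text = heldout) %>%
  unnest_tokens(tri, text, token = "ngrams", n = 3) %>%
  filter(!is.na(tri)) %>%
  slice_sample(n = 500) %>%
  separate(tri, into = c("w1", "w2", "w3"), sep = " ")

hits <- mapply(
  function(a, b, truth) truth %in% predict_next_word(paste(a, b), k = 3),
  test_cases$w1, test_cases$w2, test_cases$w3
)
mean(hits)   # share of cases where the true word is among the top 3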