library(slam)  # provides row_sums() for sparse (simple triplet) matrices

tdm <- TermDocumentMatrix(corpus)

# Total count of each term across all documents, most frequent first.
# row_sums() works on the sparse representation directly, which avoids the
# memory cost of converting the term-document matrix to a big dense matrix.
word_freq <- sort(row_sums(tdm), decreasing = TRUE)

# One row per term, in descending order of frequency.
# stringsAsFactors = FALSE keeps `word` a character column on R < 4.0 too,
# where the default would silently convert it to a factor.
freq_df <- data.frame(
  word = names(word_freq),
  freq = as.numeric(word_freq),
  row.names = NULL,
  stringsAsFactors = FALSE
)

# Preview the ten most frequent terms. The `##` lines below are the console
# output captured from a previous run, kept for reference.
head(freq_df, 10)
##    word  freq
## 1   the 47584
## 2   and 24097
## 3   for 11118
## 4  that 10409
## 5   you  9221
## 6  with  7230
## 7   was  6273
## 8  this  5318
## 9  have  5268
## 10  but  4827
### 4.2 Plot of top 20 words
# Take the first 20 rows of freq_df and draw them as a bar chart, with bars
# ordered from the highest to the lowest frequency.
top20 <- head(freq_df, 20)

ggplot(top20, aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(x = "Word", y = "Frequency", title = "Top 20 Most Frequent Words") +
  # Slant the word labels so that neighbouring labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 5. Plan for the prediction algorithm

# For the final project I plan to build an n-gram-based language model using
# the training data:
#
# Construct frequency tables for unigrams (single words), bigrams (two-word
# sequences), and trigrams (three-word sequences) from a cleaned sample of the
# corpus.
#
# When the user types a phrase, use the last one or two words as context and
# search for matching trigrams or bigrams to predict the next word.
#
# Use a backoff strategy: try trigrams first, then bigrams, and finally fall
# back to common unigrams if no higher-order match is found.
#
# Apply simple filtering (for example, removing n-grams that appear only once)
# to reduce noise and keep the model efficient enough for a Shiny app.
#
# The Shiny application will provide an easy-to-use interface for the prediction
# algorithm:
#
# The user interface will contain a text input box and a display area that shows
# suggested next words, and the server will preprocess the input and use the
# n-gram model to generate predictions.
#
# In this milestone report I have demonstrated that the data were loaded and
# summarised, and that I have a plan for the model and the Shiny app.