library(slam) # provides row_sums() for sparse term-document matrices
# Build the term-document matrix for the corpus.
tdm <- TermDocumentMatrix(corpus)

# Total count of each term across all documents, most frequent first.
# row_sums() operates on the sparse representation directly, so the matrix
# never has to be expanded into a large dense matrix.
word_freq <- sort(row_sums(tdm), decreasing = TRUE)

# Two-column frequency table: the term (taken from the names of word_freq)
# and its total count. row.names = NULL keeps plain 1..n row numbers.
freq_df <- data.frame(
  word = names(word_freq),
  freq = as.numeric(word_freq),
  row.names = NULL
)

# Ten most frequent words.
head(freq_df, 10)
## word freq
## 1 the 47584
## 2 and 24097
## 3 for 11118
## 4 that 10409
## 5 you 9221
## 6 with 7230
## 7 was 6273
## 8 this 5318
## 9 have 5268
## 10 but 4827
### 4.2 Plot of top 20 words
# Bar chart of the 20 most frequent words.
# Take the first 20 rows of the frequency table.
top20 <- head(freq_df, n = 20)

# reorder(word, -freq) places the bars in descending order of frequency
# rather than alphabetically; the x labels are rotated 45 degrees so they
# do not overlap.
ggplot(top20, aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(
    x = "Word",
    y = "Frequency",
    title = "Top 20 Most Frequent Words"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 5. Plan for the prediction algorithm
# For the final project I plan to build an n-gram-based language model from
# the training data:
#
# Construct frequency tables for unigrams (single words), bigrams (two-word
# sequences), and trigrams (three-word sequences) from a cleaned sample of the
# corpus.
#
# When the user types a phrase, use the last one or two words as context and
# search for matching trigrams or bigrams to predict the next word.
#
# Use a backoff strategy: try trigrams first, then bigrams, and finally fall
# back to common unigrams if no higher-order match is found.
#
# Apply simple filtering (for example, removing n-grams that appear only once)
# to reduce noise and keep the model efficient enough for a Shiny app.
#
# The Shiny application will provide an easy-to-use interface for the prediction
# algorithm:
#
# The user interface will contain a text input box and a display area that shows
# suggested next words, and the server will preprocess the input and use the
# n-gram model to generate predictions.
#
# In this milestone report I have demonstrated that the data were loaded and
# summarised, and that I have a plan for the model and the Shiny app.