# Read up to the first 10,000 lines of each English corpus file.
#   encoding = "UTF-8": mark the strings as UTF-8 so character counts and
#     tokenisation do not depend on the session's native encoding.
#   skipNul = TRUE: skip embedded NUL bytes instead of letting them
#     terminate the line being read.
#   warn = FALSE: silence the benign "incomplete final line" warning that
#     is raised when a file does not end with a newline.
blogs <- readLines(
  "en_US.blogs.txt",
  n = 10000L, encoding = "UTF-8", skipNul = TRUE, warn = FALSE
)
news <- readLines(
  "en_US.news.txt",
  n = 10000L, encoding = "UTF-8", skipNul = TRUE, warn = FALSE
)
twitter <- readLines(
  "en_US.twitter.txt",
  n = 10000L, encoding = "UTF-8", skipNul = TRUE, warn = FALSE
)
# Number of characters in each blog line, drawn as a histogram with
# 50-character bins.
blogs_char <- nchar(blogs)
ggplot(data.frame(length = blogs_char), aes(x = length)) +
  geom_histogram(binwidth = 50, fill = "blue", color = "black") +
  labs(
    title = "Distribution of line lengths in Blogs",
    x = "Number of characters",
    y = "Frequency"
  )

# Tokenise the blog lines: the `text` column is split into a `word` column,
# one token per row.
blogs_words <- tibble(text = blogs) %>%
  unnest_tokens(word, text)

# Ten most frequent words. slice_max() replaces the superseded top_n() and
# names the ranking column explicitly, so the "Selecting by n" message is
# no longer emitted; ties at the cut-off are kept, as top_n() did.
blogs_words %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 10)
## # A tibble: 1 × 2
##   word      n
##   <chr> <int>
## 1 test1     1
# NOTE(review): this chunk repeats the blogs analysis that already appears
# earlier in the file, while `news` and `twitter` are loaded but not analysed
# in the visible code. Confirm whether this copy was meant for one of them.

# Number of characters in each blog line, drawn as a histogram with
# 50-character bins.
blogs_char <- nchar(blogs)
ggplot(data.frame(length = blogs_char), aes(x = length)) +
  geom_histogram(binwidth = 50, fill = "blue", color = "black") +
  labs(
    title = "Distribution of line lengths in Blogs",
    x = "Number of characters",
    y = "Frequency"
  )

# Tokenise the blog lines: the `text` column is split into a `word` column,
# one token per row.
blogs_words <- tibble(text = blogs) %>%
  unnest_tokens(word, text)

# Ten most frequent words. slice_max() replaces the superseded top_n() and
# names the ranking column explicitly, so the "Selecting by n" message is
# no longer emitted; ties at the cut-off are kept, as top_n() did.
blogs_words %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 10)
## # A tibble: 1 × 2
##   word      n
##   <chr> <int>
## 1 test1     1
# Print the project roadmap, one item per line. writeLines() is used instead
# of cat() with its default " " separator, which put a stray space at the
# start of every item line and left the output without a trailing newline.
writeLines(c(
  "Next steps:",
  "1. Build n-gram models (unigram, bigram, trigram) from tokenized words.",
  "2. Handle unseen n-grams using smoothing and backoff techniques.",
  "3. Evaluate model performance with test phrases.",
  "4. Create a Shiny app that predicts the next word for a given input phrase."
))
## Next steps:
## 1. Build n-gram models (unigram, bigram, trigram) from tokenized words.
## 2. Handle unseen n-grams using smoothing and backoff techniques.
## 3. Evaluate model performance with test phrases.
## 4. Create a Shiny app that predicts the next word for a given input phrase.

```