# Load the first 10,000 lines of each English corpus file.
#
# Each of these reads used to emit an "incomplete final line found" warning
# into the rendered report because the files do not end with a newline. That
# is harmless here, so `warn = FALSE` keeps it out of the output. The same
# switch also silences embedded-NUL warnings (where the rest of the line would
# be dropped), so `skipNul = TRUE` is set to make sure such lines are still
# read in full rather than silently truncated.
blogs <- readLines("en_US.blogs.txt", n = 10000, warn = FALSE, skipNul = TRUE)
news <- readLines("en_US.news.txt", n = 10000, warn = FALSE, skipNul = TRUE)
twitter <- readLines(
  "en_US.twitter.txt",
  n = 10000, warn = FALSE, skipNul = TRUE
)
# Distribution of line lengths (in characters) across the sampled blog lines.
blogs_char <- nchar(blogs)
ggplot(data.frame(length = blogs_char), aes(x = length)) +
  geom_histogram(binwidth = 50, fill = "blue", color = "black") +
  labs(
    title = "Distribution of line lengths in Blogs",
    x = "Number of characters",
    y = "Frequency"
  )

# Tokenize the blog lines into a `word` column.
blogs_words <- tibble(text = blogs) %>%
  unnest_tokens(word, text)

# Ten most frequent words. `slice_max()` replaces the superseded `top_n()`
# and names the ranking column explicitly, so dplyr no longer has to guess it
# (the old call printed "Selecting by n"). Ties at the cutoff are kept, as
# they were with `top_n()`.
blogs_words %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 10)
## Selecting by n
## # A tibble: 1 × 2
## word n
## <chr> <int>
## 1 test1 1
# Repeat the exploratory summary for the news and Twitter samples.
#
# This section previously re-ran the blogs analysis verbatim (same variables,
# same plot title), so `news` and `twitter` were loaded but never examined.
# `blogs_char` and `blogs_words` remain available from the first pass.
other_corpora <- list(News = news, Twitter = twitter)
for (corpus_name in names(other_corpora)) {
  corpus_lines <- other_corpora[[corpus_name]]

  # Nothing auto-prints inside a loop, so plots and tables need print().
  print(
    ggplot(data.frame(length = nchar(corpus_lines)), aes(x = length)) +
      geom_histogram(binwidth = 50, fill = "blue", color = "black") +
      labs(
        title = paste("Distribution of line lengths in", corpus_name),
        x = "Number of characters",
        y = "Frequency"
      )
  )

  # Ten most frequent words; `slice_max()` replaces the superseded `top_n()`
  # and names the ranking column explicitly. Ties at the cutoff are kept.
  print(
    tibble(text = corpus_lines) %>%
      unnest_tokens(word, text) %>%
      count(word, sort = TRUE) %>%
      slice_max(order_by = n, n = 10)
  )
}
## Selecting by n
## # A tibble: 1 × 2
## word n
## <chr> <int>
## 1 test1 1
# Print the project roadmap. `sep = ""` stops cat() from inserting its default
# single-space separator, which otherwise lands at the start of every line
# after the first; the trailing "\n" ends the output on a complete line.
cat(
  "Next steps:\n",
  "1. Build n-gram models (unigram, bigram, trigram) from tokenized words.\n",
  "2. Handle unseen n-grams using smoothing and backoff techniques.\n",
  "3. Evaluate model performance with test phrases.\n",
  "4. Create a Shiny app that predicts the next word for a given input phrase.\n",
  sep = ""
)
## Next steps:
## 1. Build n-gram models (unigram, bigram, trigram) from tokenized words.
## 2. Handle unseen n-grams using smoothing and backoff techniques.
## 3. Evaluate model performance with test phrases.
## 4. Create a Shiny app that predicts the next word for a given input phrase.
```