library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(ngram)
# Location of the en_US news corpus on the author's machine.
file_path <- "D:\\Manipal\\Sem4\\Project\\Coursera-SwiftKey\\final\\en_US\\en_US.news.txt"

# Read through a binary-mode connection instead of passing the path directly.
# A text-mode read on Windows can stop at an embedded Ctrl-Z (0x1A) byte and
# silently return only the first part of the file; "rb" reads every line.
# skipNul drops embedded NUL bytes and encoding marks the strings as UTF-8.
# NOTE(review): compare length(text_data) before/after this change to confirm
# the earlier read was truncated.
news_con <- file(file_path, open = "rb")
text_data <- tryCatch(
  readLines(news_con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE),
  finally = close(news_con)
)

# Join all lines into one string so the n-gram counter sees a single document.
corpus <- paste(text_data, collapse = " ")

# Normalise: lower-case, turn punctuation and digit runs into spaces, then
# collapse repeated whitespace. Replacing apostrophes with a space splits
# contractions ("wasn't" -> "wasn t"); predict_next_word() input must be
# tokenised the same way to match.
clean_text <- corpus %>%
  tolower() %>%
  str_replace_all("[[:punct:]]", " ") %>%
  str_replace_all("\\d+", " ") %>%
  str_squish()

# Peek at the first 150 characters as a sanity check.
substr(clean_text, 1, 150)
## [1] "he wasn t home alone apparently the st louis plant had to close it would die of old age workers had been making cars there since the onset of mass aut"
# Split the cleaned corpus on whitespace and discard empty tokens.
words <- unlist(str_split(clean_text, "\\s+"))
words <- words[nzchar(words)]

# Tabulate the tokens, order by descending count, then give the two columns
# their final names ("Freq" is the count column as.data.frame() produces).
word_freq <- as.data.frame(table(words), stringsAsFactors = FALSE)
word_freq <- arrange(word_freq, desc(Freq))
colnames(word_freq) <- c("Word", "Frequency")

# Show the ten most frequent words.
print(head(word_freq, n = 10))
## Word Frequency
## 1 the 151691
## 2 to 69757
## 3 and 68597
## 4 a 68594
## 5 of 59314
## 6 in 51897
## 7 s 34903
## 8 that 28227
## 9 for 27164
## 10 is 21972
# Phrase table (ngrams / freq / prop columns) of all two-word sequences in the
# cleaned full corpus, converted to a plain data frame.
bigram_freq <- as.data.frame(
  get.phrasetable(ngram::ngram(clean_text, n = 2))
)
print(head(bigram_freq, n = 10))
## ngrams freq prop
## 1 of the 14166 0.005304112
## 2 in the 13804 0.005168570
## 3 to the 6459 0.002418415
## 4 on the 5570 0.002085550
## 5 for the 5402 0.002022647
## 6 it s 5103 0.001910694
## 7 at the 4538 0.001699143
## 8 in a 4073 0.001525035
## 9 and the 4064 0.001521665
## 10 to be 3576 0.001338946
# Phrase table (ngrams / freq / prop columns) of all three-word sequences in
# the cleaned full corpus, converted to a plain data frame.
trigram_freq <- as.data.frame(
  get.phrasetable(ngram::ngram(clean_text, n = 3))
)
print(head(trigram_freq, n = 10))
## ngrams freq prop
## 1 one of the 1079 0.0004040053
## 2 a lot of 877 0.0003283713
## 3 the u s 818 0.0003062802
## 4 it s a 786 0.0002942986
## 5 i don t 691 0.0002587281
## 6 as well as 479 0.0001793499
## 7 it s not 469 0.0001756056
## 8 according to the 435 0.0001628752
## 9 in the first 430 0.0001610030
## 10 going to be 420 0.0001572588
# Horizontal bar chart of the first 20 rows of word_freq. reorder() orders the
# bars by count and coord_flip() puts the words on the vertical axis.
ggplot(
  data = head(word_freq, 20),
  mapping = aes(x = reorder(Word, Frequency), y = Frequency)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words", x = "Words", y = "Frequency") +
  theme_minimal()
# Bigram chart: horizontal bars for the first 10 rows of bigram_freq, ordered
# by their freq column.
ggplot(
  data = head(bigram_freq, 10),
  mapping = aes(x = reorder(ngrams, freq), y = freq)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Bigrams", x = "Bigrams", y = "Frequency") +
  theme_minimal()
# Trigram chart: horizontal bars for the first 10 rows of trigram_freq, ordered
# by their freq column.
ggplot(
  data = head(trigram_freq, 10),
  mapping = aes(x = reorder(ngrams, freq), y = freq)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Trigrams", x = "Trigrams", y = "Frequency") +
  theme_minimal()
# Fix the RNG seed (the value itself is arbitrary) so the sampled lines, and
# every table built from them below, are the same on each run.
set.seed(1234)

# Draw up to 10,000 lines without replacement. The cap avoids the
# "cannot take a sample larger than the population" error when fewer lines
# were read.
sampled_text <- sample(
  text_data,
  size = min(10000, length(text_data)),
  replace = FALSE
)
corpus_sample <- paste(sampled_text, collapse = " ")

# Same normalisation as the full corpus: lower-case, punctuation and digit
# runs become spaces, whitespace collapsed. This overwrites the full-corpus
# `clean_text`; everything below works on the sample.
clean_text <- corpus_sample %>%
  tolower() %>%
  str_replace_all("[[:punct:]]", " ") %>%
  str_replace_all("\\d+", " ") %>%
  str_squish()

# Peek at the first 150 characters as a sanity check.
substr(clean_text, 1, 150)
## [1] "in lansing school district the payroll looks a little like a family tree while many patients now seek support through online chat rooms face to face g"
# Single-word phrase table for the sampled text, as a plain data frame.
unigram_freq <- as.data.frame(
  get.phrasetable(ngram::ngram(clean_text, n = 1))
)
head(unigram_freq, n = 5)
## ngrams freq prop
## 1 the 19451 0.05676688
## 2 to 8992 0.02624275
## 3 and 8859 0.02585460
## 4 a 8777 0.02561528
## 5 of 7636 0.02228533
# Two-word phrase table for the sampled text; replaces the full-corpus
# bigram_freq built earlier.
bigram_freq <- as.data.frame(
  get.phrasetable(ngram::ngram(clean_text, n = 2))
)
head(bigram_freq, n = 5)
## ngrams freq prop
## 1 of the 1813 0.005291175
## 2 in the 1717 0.005011003
## 3 to the 838 0.002445673
## 4 on the 676 0.001972882
## 5 for the 673 0.001964126
# Three-word phrase table for the sampled text; replaces the full-corpus
# trigram_freq built earlier.
trigram_freq <- as.data.frame(
  get.phrasetable(ngram::ngram(clean_text, n = 3))
)
head(trigram_freq, n = 5)
## ngrams freq prop
## 1 one of the 142 0.0004144231
## 2 the u s 116 0.0003385428
## 3 a lot of 111 0.0003239504
## 4 i don t 101 0.0002947657
## 5 it s a 99 0.0002889288
#' Predict the next word with a trigram -> bigram -> unigram back-off.
#'
#' The input is normalised the same way the corpus text in this script is
#' (lower-cased, punctuation and digit runs replaced by spaces, whitespace
#' collapsed) so that its tokens can match the n-gram tables. Prefixes are
#' compared literally, so characters such as `.`, `(` or `+` in the input can
#' neither act as regex operators nor raise a regex error.
#'
#' @param input_text Character; the text typed so far. Several elements are
#'   joined with single spaces.
#' @param unigram_freq,bigram_freq,trigram_freq Data frames with a character
#'   column `ngrams` (space-separated words; a trailing space is tolerated)
#'   and, optionally, a numeric column `freq`. When `freq` is present the
#'   highest-count match wins; otherwise the first matching row is used.
#' @return The predicted word as a character string. The trigram table is
#'   consulted only when at least two input words are available, the bigram
#'   table when at least one is; if neither matches, the first entry of
#'   `unigram_freq$ngrams` (whitespace-trimmed) is returned.
predict_next_word <- function(input_text, unigram_freq, bigram_freq,
                              trigram_freq) {
  # Normalise and tokenise the input like the corpus.
  cleaned <- tolower(paste(input_text, collapse = " "))
  cleaned <- gsub("[[:punct:]]", " ", cleaned)
  cleaned <- gsub("[[:digit:]]+", " ", cleaned)
  cleaned <- trimws(gsub("\\s+", " ", cleaned))
  input_words <- strsplit(cleaned, " ", fixed = TRUE)[[1]]
  input_words <- input_words[nzchar(input_words)]
  n_words <- length(input_words)

  # Best continuation of `context_words` in `freq_table`, or NULL if none.
  best_continuation <- function(freq_table, context_words) {
    # The trailing space makes the prefix end on a word boundary.
    prefix <- paste0(paste(context_words, collapse = " "), " ")
    phrases <- as.character(freq_table[["ngrams"]])
    hits <- which(startsWith(phrases, prefix))
    if (length(hits) == 0) {
      return(NULL)
    }
    # Do not rely on the table being pre-sorted: pick the largest count.
    best <- hits[1]
    counts <- freq_table[["freq"]][hits]
    if (is.numeric(counts)) {
      top <- which.max(counts)
      if (length(top) == 1) {
        best <- hits[top]
      }
    }
    # The predicted word is the token right after the context words.
    tokens <- strsplit(phrases[best], " ", fixed = TRUE)[[1]]
    candidate <- tokens[length(context_words) + 1]
    if (is.na(candidate) || !nzchar(candidate)) {
      return(NULL)
    }
    candidate
  }

  # Trigram step needs a genuine two-word context. With a single word the
  # old lookup matched "word X Y" and returned Y, the word after next.
  if (n_words >= 2) {
    guess <- best_continuation(trigram_freq, input_words[(n_words - 1):n_words])
    if (!is.null(guess)) {
      return(guess)
    }
  }

  # Back off to the last word alone.
  if (n_words >= 1) {
    guess <- best_continuation(bigram_freq, input_words[n_words])
    if (!is.null(guess)) {
      return(guess)
    }
  }

  # Nothing matched: fall back to the first unigram entry, stripped of any
  # surrounding whitespace so callers get a bare word.
  trimws(as.character(unigram_freq[["ngrams"]][1]))
}
#' Append an add-alpha (Laplace) smoothed probability column.
#'
#' Adds `Smoothed_Prob = (freq + alpha) / (sum(freq) + alpha * vocab_size)`
#' to the table, computed from its `freq` column.
#'
#' NOTE(review): each row's numerator gains `alpha` while the denominator
#' gains `alpha * vocab_size`, so the column sums to 1 only when the table
#' has exactly `vocab_size` rows. Confirm this is the intended normaliser when
#' the table holds bigrams or trigrams.
#'
#' @param freq_table Data frame with a numeric `freq` column.
#' @param vocab_size Vocabulary size used in the denominator.
#' @param alpha Pseudo-count added to every observed count (default 1).
#' @return `freq_table` with the extra `Smoothed_Prob` column.
apply_laplace_smoothing <- function(freq_table, vocab_size, alpha = 1) {
  mutate(
    freq_table,
    Smoothed_Prob = (freq + alpha) / (sum(freq) + alpha * vocab_size)
  )
}
# Vocabulary size = number of distinct words (one row per word in the
# unigram table).
vocab_size <- nrow(unigram_freq)

# Attach the Smoothed_Prob column to the bigram and trigram tables.
bigram_freq <- apply_laplace_smoothing(bigram_freq, vocab_size = vocab_size)
trigram_freq <- apply_laplace_smoothing(trigram_freq, vocab_size = vocab_size)

# Demo: predict the word that follows a one-word prompt.
input_text <- "several"
next_word <- predict_next_word(
  input_text,
  unigram_freq = unigram_freq,
  bigram_freq = bigram_freq,
  trigram_freq = trigram_freq
)
cat("Predicted next word:", next_word, "\n")
## Predicted next word: the