library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## Warning: package 'lubridate' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.3.3

library(ngram)

Defining the file path

# Absolute, machine-specific location of the en_US news corpus text file;
# change this path before running the analysis on another machine.
file_path <- "D:\\Manipal\\Sem4\\Project\\Coursera-SwiftKey\\final\\en_US\\en_US.news.txt"

Reading the file

# Read the corpus through a binary-mode connection. In text mode on Windows an
# embedded control byte (Ctrl-Z) can be taken as end-of-file, which silently
# truncates the read; binary mode reads the file to its real end.
# `skipNul = TRUE` drops embedded NUL bytes and `encoding = "UTF-8"` marks the
# lines as UTF-8 so later string handling sees the intended characters.
con <- file(file_path, open = "rb")
text_data <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)

# Collapse all lines into one string so n-grams can be counted over the
# whole corpus.
corpus <- paste(text_data, collapse = " ")

Clean the text data

# Normalise the corpus: lower-case it, replace punctuation and digit runs with
# a space, then squeeze repeated whitespace down to single spaces.
clean_text <- str_squish(
  str_replace_all(
    str_replace_all(tolower(corpus), "[[:punct:]]", " "),
    "\\d+",
    " "
  )
)

# Preview the first 150 characters of the cleaned corpus.
substr(clean_text, 1, 150)

## [1] "he wasn t home alone apparently the st louis plant had to close it would die of old age workers had been making cars there since the onset of mass aut"

Tokenize into words

# Split the cleaned corpus on whitespace runs and keep only non-empty tokens.
words <- unlist(str_split(clean_text, "\\s+"))
words <- words[nzchar(words)]

Calculate word frequencies

# Tabulate the tokens into a Word/Frequency data frame, most frequent first.
word_freq <- table(words) %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  setNames(c("Word", "Frequency")) %>%
  arrange(desc(Frequency))

# Show the ten most frequent words.
print(head(word_freq, 10))

##    Word Frequency
## 1   the    151691
## 2    to     69757
## 3   and     68597
## 4     a     68594
## 5    of     59314
## 6    in     51897
## 7     s     34903
## 8  that     28227
## 9   for     27164
## 10   is     21972

Generating 2-grams

# Build the two-word phrase table for the cleaned corpus.
bigram_freq <- clean_text %>%
  ngram::ngram(n = 2) %>%
  get.phrasetable() %>%
  as.data.frame()

# Show the first ten rows of the bigram table.
print(head(bigram_freq, 10))

##      ngrams  freq        prop
## 1   of the  14166 0.005304112
## 2   in the  13804 0.005168570
## 3   to the   6459 0.002418415
## 4   on the   5570 0.002085550
## 5  for the   5402 0.002022647
## 6     it s   5103 0.001910694
## 7   at the   4538 0.001699143
## 8     in a   4073 0.001525035
## 9  and the   4064 0.001521665
## 10   to be   3576 0.001338946

Generating 3-grams

# Build the three-word phrase table for the cleaned corpus.
trigram_freq <- clean_text %>%
  ngram::ngram(n = 3) %>%
  get.phrasetable() %>%
  as.data.frame()

# Show the first ten rows of the trigram table.
print(head(trigram_freq, 10))

##               ngrams freq         prop
## 1        one of the  1079 0.0004040053
## 2          a lot of   877 0.0003283713
## 3           the u s   818 0.0003062802
## 4            it s a   786 0.0002942986
## 5           i don t   691 0.0002587281
## 6        as well as   479 0.0001793499
## 7          it s not   469 0.0001756056
## 8  according to the   435 0.0001628752
## 9      in the first   430 0.0001610030
## 10      going to be   420 0.0001572588

Visualize word frequency distribution

# Horizontal bar chart of the first 20 rows of the word-frequency table,
# with bars ordered by frequency.
word_freq %>%
  head(20) %>%
  ggplot(aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Words",
    x = "Words",
    y = "Frequency"
  ) +
  theme_minimal()

# Visualize 2-gram

# Horizontal bar chart of the first 10 rows of the bigram table,
# with bars ordered by frequency.
bigram_freq %>%
  head(10) %>%
  ggplot(aes(x = reorder(ngrams, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 10 Bigrams",
    x = "Bigrams",
    y = "Frequency"
  ) +
  theme_minimal()

# Visualize 3-gram

# Horizontal bar chart of the first 10 rows of the trigram table,
# with bars ordered by frequency.
trigram_freq %>%
  head(10) %>%
  ggplot(aes(x = reorder(ngrams, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 10 Trigrams",
    x = "Trigrams",
    y = "Frequency"
  ) +
  theme_minimal()

To build the prediction model, take a small sample of the text to reduce processing time.

# Fix the RNG seed so the same lines are drawn, and therefore the same n-gram
# tables and predictions are produced, on every run.
set.seed(1234)

# Draw up to 10000 lines without replacement. The size is capped at the number
# of lines actually read, because sample() errors when asked for more items
# than exist with replace = FALSE.
sampled_text <- sample(
  text_data,
  size = min(10000, length(text_data)),
  replace = FALSE
)

# Collapse the sampled lines into a single string for cleaning and counting.
corpus_sample <- paste(sampled_text, collapse = " ")

Continue with cleaning and n-gram generation on the sampled text

# Normalise the sampled text the same way as the full corpus: lower-case it,
# replace punctuation and digit runs with a space, then squeeze whitespace.
# Note that this overwrites the earlier `clean_text`.
clean_text <- str_squish(
  str_replace_all(
    str_replace_all(tolower(corpus_sample), "[[:punct:]]", " "),
    "\\d+",
    " "
  )
)

# Preview the first 150 characters of the cleaned sample.
substr(clean_text, 1, 150)

## [1] "in lansing school district the payroll looks a little like a family tree while many patients now seek support through online chat rooms face to face g"

Generate unigrams

# Build the single-word phrase table for the cleaned sample.
unigram_freq <- clean_text %>%
  ngram::ngram(n = 1) %>%
  get.phrasetable() %>%
  as.data.frame()

# Show the first five rows.
head(unigram_freq, 5)

##   ngrams  freq       prop
## 1   the  19451 0.05676688
## 2    to   8992 0.02624275
## 3   and   8859 0.02585460
## 4     a   8777 0.02561528
## 5    of   7636 0.02228533

Generate bigrams

# Build the two-word phrase table for the cleaned sample
# (replaces the full-corpus `bigram_freq`).
bigram_freq <- clean_text %>%
  ngram::ngram(n = 2) %>%
  get.phrasetable() %>%
  as.data.frame()

# Show the first five rows.
head(bigram_freq, 5)

##     ngrams freq        prop
## 1  of the  1813 0.005291175
## 2  in the  1717 0.005011003
## 3  to the   838 0.002445673
## 4  on the   676 0.001972882
## 5 for the   673 0.001964126

Generate trigrams

# Build the three-word phrase table for the cleaned sample
# (replaces the full-corpus `trigram_freq`).
trigram_freq <- clean_text %>%
  ngram::ngram(n = 3) %>%
  get.phrasetable() %>%
  as.data.frame()

# Show the first five rows.
head(trigram_freq, 5)

##        ngrams freq         prop
## 1 one of the   142 0.0004144231
## 2    the u s   116 0.0003385428
## 3   a lot of   111 0.0003239504
## 4    i don t   101 0.0002947657
## 5     it s a    99 0.0002889288

Prediction function

#' Predict the next word with a trigram -> bigram -> unigram back-off.
#'
#' The input is normalised the same way as the training text (lower-cased,
#' punctuation and digits replaced by spaces, whitespace collapsed) so that
#' its words can match the n-gram tables.
#'
#' @param input_text Character vector with the text typed so far. `NA`
#'   elements are ignored and multiple elements are joined with spaces.
#' @param unigram_freq Data frame with an `ngrams` column of single words.
#'   Its first row is used as the fallback prediction.
#' @param bigram_freq Data frame with an `ngrams` column of space-separated
#'   two-word phrases.
#' @param trigram_freq Data frame with an `ngrams` column of space-separated
#'   three-word phrases.
#' @return A character scalar: the predicted next word. Within a table the
#'   first matching row wins, so the tables are assumed to be ordered from
#'   most to least frequent.
predict_next_word <- function(input_text,
                              unigram_freq,
                              bigram_freq,
                              trigram_freq) {
  # Normalise the input exactly like the corpus so tokens are comparable.
  cleaned <- paste(input_text[!is.na(input_text)], collapse = " ")
  cleaned <- tolower(cleaned)
  cleaned <- gsub("[[:punct:]]", " ", cleaned)
  cleaned <- gsub("[[:digit:]]+", " ", cleaned)
  input_words <- unlist(strsplit(trimws(cleaned), "[[:space:]]+"))
  input_words <- input_words[!is.na(input_words) & nzchar(input_words)]
  n_words <- length(input_words)

  # Return the word following `prefix_words` in the first n-gram that starts
  # with them, or NULL when no n-gram matches. Literal prefix matching is used
  # instead of a regex so user input can never be interpreted as a pattern.
  next_after <- function(prefix_words, ngrams) {
    ngrams <- as.character(ngrams)
    prefix <- paste0(paste(prefix_words, collapse = " "), " ")
    hits <- ngrams[!is.na(ngrams) & startsWith(ngrams, prefix)]
    if (length(hits) == 0) {
      return(NULL)
    }
    strsplit(hits[1], " ", fixed = TRUE)[[1]][length(prefix_words) + 1]
  }

  # Trigram lookup needs two context words; with fewer, a trigram prefix
  # match would return the wrong position, so go straight to bigrams.
  if (n_words >= 2) {
    guess <- next_after(tail(input_words, 2), trigram_freq$ngrams)
    if (!is.null(guess)) {
      return(guess)
    }
  }

  # Bigram lookup on the last word.
  if (n_words >= 1) {
    guess <- next_after(tail(input_words, 1), bigram_freq$ngrams)
    if (!is.null(guess)) {
      return(guess)
    }
  }

  # Nothing matched: fall back to the first unigram, trimmed so the result
  # has no stray whitespace, like the other branches.
  trimws(as.character(unigram_freq$ngrams[1]))
}

#' Add an additive-smoothed probability column to a frequency table.
#'
#' Computes `(freq + alpha) / (sum(freq) + alpha * vocab_size)` for every row
#' and stores it in a new `Smoothed_Prob` column.
#'
#' @param freq_table Data frame with a numeric `freq` column of counts.
#' @param vocab_size Non-negative numeric scalar used in the denominator.
#' @param alpha Non-negative numeric scalar smoothing constant (default 1).
#' @return `freq_table` with an added `Smoothed_Prob` column.
apply_laplace_smoothing <- function(freq_table, vocab_size, alpha = 1) {
  # Fail early with a clear message instead of an obscure downstream error.
  stopifnot(
    is.data.frame(freq_table),
    "freq" %in% names(freq_table),
    is.numeric(freq_table[["freq"]]),
    is.numeric(vocab_size), length(vocab_size) == 1, vocab_size >= 0,
    is.numeric(alpha), length(alpha) == 1, alpha >= 0
  )

  # Sum as doubles: integer counts could otherwise overflow to NA.
  counts <- as.numeric(freq_table[["freq"]])
  denominator <- sum(counts) + alpha * vocab_size

  freq_table$Smoothed_Prob <- (counts + alpha) / denominator
  freq_table
}

Apply smoothing

# Use the number of rows in the unigram table as the vocabulary size.
vocab_size <- nrow(unigram_freq)

# Add the smoothed-probability column to the bigram and trigram tables.
bigram_freq <- apply_laplace_smoothing(
  freq_table = bigram_freq,
  vocab_size = vocab_size
)
trigram_freq <- apply_laplace_smoothing(
  freq_table = trigram_freq,
  vocab_size = vocab_size
)

Test input

# Try the predictor on a one-word prompt and print its suggestion.
input_text <- "several"
next_word <- predict_next_word(
  input_text = input_text,
  unigram_freq = unigram_freq,
  bigram_freq = bigram_freq,
  trigram_freq = trigram_freq
)
cat("Predicted next word:", next_word, "\n")

## Predicted next word: the

Exploratory data analysis and Modelling

2024-12-13

Defining the file path

Reading the file

Clean the text data

Tokenize into words

Calculate word frequencies

Generating 2-grams

Generating 3-grams

Visualize word frequency distribution

To build the prediction model, take a small sample of the text to reduce processing time.

Continue with cleaning and n-gram generation on the sampled text

Generate unigrams

Generate bigrams

Generate trigrams

Prediction function

Apply smoothing

Test input