# Location of the plain-text corpus used to build the n-gram table.
file_path <- "/cloud/project/1661-0.txt"

# Read every line and lower-case it so word counts are case-insensitive.
text <- tolower(readLines(file_path))
# Join all lines into one long string separated by single spaces.
text <- paste(text, collapse = " ")
# Drop punctuation. Note that this fuses tokens such as "gutenberg's" into
# "gutenbergs" rather than splitting them.
text <- gsub("[[:punct:]]", "", text)
# Collapse whitespace runs to one space, then trim both ends: a leading space
# would otherwise make a later strsplit(text, " ") emit an empty first token.
text <- trimws(gsub("\\s+", " ", text))

# Split the cleaned text into individual word tokens.
words <- strsplit(text, " ")[[1]]

# Context size: every row holds `n` consecutive words plus the word after them.
n <- 3

# Start position of every complete n-word window. seq_len() yields an empty
# sequence (instead of c(1, 0)) when the text has fewer than `n` words.
ngram_starts <- seq_len(max(length(words) - n + 1, 0))

# Build all columns at once instead of filling the data frame row by row.
# Column k holds the word at offset k - 1 from each window start. Indexing one
# past the end of `words` returns NA, so the final row's next-word is NA.
my_dataset <- as.data.frame(
  setNames(
    lapply(0:n, function(offset) words[ngram_starts + offset]),
    paste0("word", seq_len(n + 1))
  ),
  stringsAsFactors = FALSE
)

head(my_dataset, 30)
##           word1        word2        word3        word4
## 1       project   gutenbergs          the   adventures
## 2    gutenbergs          the   adventures           of
## 3           the   adventures           of     sherlock
## 4    adventures           of     sherlock       holmes
## 5            of     sherlock       holmes           by
## 6      sherlock       holmes           by       arthur
## 7        holmes           by       arthur        conan
## 8            by       arthur        conan        doyle
## 9        arthur        conan        doyle         this
## 10        conan        doyle         this        ebook
## 11        doyle         this        ebook           is
## 12         this        ebook           is          for
## 13        ebook           is          for          the
## 14           is          for          the          use
## 15          for          the          use           of
## 16          the          use           of       anyone
## 17          use           of       anyone     anywhere
## 18           of       anyone     anywhere           at
## 19       anyone     anywhere           at           no
## 20     anywhere           at           no         cost
## 21           at           no         cost          and
## 22           no         cost          and         with
## 23         cost          and         with       almost
## 24          and         with       almost           no
## 25         with       almost           no restrictions
## 26       almost           no restrictions   whatsoever
## 27           no restrictions   whatsoever          you
## 28 restrictions   whatsoever          you          may
## 29   whatsoever          you          may         copy
## 30          you          may         copy           it

Introduction

The purpose of this report is to analyze the n-gram dataset built above (`my_dataset`) and to summarize the progress made towards creating a next-word prediction algorithm. The document is deliberately concise: it focuses on key characteristics of the dataset and outlines the plans for developing the prediction algorithm and the Shiny app.

Summary Statistics

Let’s begin by examining some summary statistics of the n-gram dataset (`my_dataset`):

# Per-column overview of the n-gram table; for character columns this reports
# only the length, class and mode.
summary(my_dataset)
##     word1              word2              word3              word4          
##  Length:107545      Length:107545      Length:107545      Length:107545     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character

## Analysis of Data

### Frequency of Word Combinations

We can calculate the frequency of each four-word combination in the dataset to identify common patterns:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Join the four words of every row into a single space-separated phrase.
my_dataset$phrase <- paste(
  my_dataset$word1, my_dataset$word2, my_dataset$word3, my_dataset$word4,
  sep = " "
)
# paste() renders a missing value as the literal text "NA", so rows without a
# next word (word4 is NA) would otherwise produce a bogus phrase ending in "NA".
my_dataset$phrase[is.na(my_dataset$word4)] <- NA_character_

# Count how often each complete phrase occurs, most frequent first.
phrase_freq <- my_dataset %>%
  filter(!is.na(phrase)) %>%
  count(phrase, name = "count", sort = TRUE)

head(phrase_freq, 10)
## # A tibble: 10 × 2
##    phrase                                count
##    <chr>                                 <int>
##  1 i have no doubt                          17
##  2 i think that i                           17
##  3 have no doubt that                       14
##  4 the adventure of the                     14
##  5 gutenberg literary archive foundation    13
##  6 i do not know                            13
##  7 project gutenberg literary archive       13
##  8 the project gutenberg literary           13
##  9 project gutenbergtm electronic works     12
## 10 it seemed to me                          11

## Visualization of Word Combinations

Let’s visualize the 20 most frequent word combinations:

library(ggplot2)

# Plot the 20 most frequent word combinations, bars ordered by descending count
ggplot(head(phrase_freq, 20), aes(x = reorder(phrase, -count), y = count)) +
  # geom_col() draws bar heights straight from `count`.
  geom_col(fill = "red") +
  labs(
    title = "Top 20 Most Frequent Word Combinations",
    x = "Word Combination",
    y = "Frequency"
  ) +
  # Rotate the phrase labels so the long x-axis text does not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Plans for Prediction Algorithm and Shiny App

The next step will be to use the data to train a prediction algorithm that predicts the next word based on the preceding word combination.