# Read the full text of "The Adventures of Sherlock Holmes" (Project
# Gutenberg ebook #1661) and normalize it for tokenization.
file_path <- "/cloud/project/1661-0.txt"
text <- tolower(readLines(file_path))   # read all lines, lowercase
text <- paste(text, collapse = " ")     # collapse into one string
text <- gsub("[[:punct:]]", "", text)   # strip punctuation
text <- gsub("\\s+", " ", text)         # squeeze runs of whitespace
words <- strsplit(text, " ")[[1]]       # tokenize on single spaces
# Build a 4-gram table: each row holds a window of n = 3 consecutive words
# plus the word that follows the window (NA for the final window).
n <- 3
my_dataset <- data.frame(matrix(nrow = length(words) - n + 1, ncol = n + 1))
colnames(my_dataset) <- c("word1", "word2", "word3", "word4")
for (i in 1:(length(words) - n + 1)) {
  my_dataset[i, 1:n] <- words[i:(i + n - 1)]   # the three-word context
  my_dataset[i, n + 1] <- ifelse(i < (length(words) - n + 1), words[i + n], NA)   # the next word
}
head(my_dataset, 30)
## word1 word2 word3 word4
## 1 project gutenbergs the adventures
## 2 gutenbergs the adventures of
## 3 the adventures of sherlock
## 4 adventures of sherlock holmes
## 5 of sherlock holmes by
## 6 sherlock holmes by arthur
## 7 holmes by arthur conan
## 8 by arthur conan doyle
## 9 arthur conan doyle this
## 10 conan doyle this ebook
## 11 doyle this ebook is
## 12 this ebook is for
## 13 ebook is for the
## 14 is for the use
## 15 for the use of
## 16 the use of anyone
## 17 use of anyone anywhere
## 18 of anyone anywhere at
## 19 anyone anywhere at no
## 20 anywhere at no cost
## 21 at no cost and
## 22 no cost and with
## 23 cost and with almost
## 24 and with almost no
## 25 with almost no restrictions
## 26 almost no restrictions whatsoever
## 27 no restrictions whatsoever you
## 28 restrictions whatsoever you may
## 29 whatsoever you may copy
## 30 you may copy it
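As an aside, filling the data frame one row at a time is slow for a corpus of this size. A vectorized sketch of the same construction using plain indexing (it drops the final NA-padded row instead of keeping it):

# Vectorized alternative (sketch): build the four columns directly by
# offsetting the word vector, keeping only complete 4-grams.
m <- length(words) - n
ngram_df <- data.frame(word1 = words[1:m],
                       word2 = words[2:(m + 1)],
                       word3 = words[3:(m + 2)],
                       word4 = words[4:(m + 3)],
                       stringsAsFactors = FALSE)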
The purpose of this report is to analyze the n-gram dataset built above (my_dataset) and to report on progress towards a Next Word Prediction algorithm. This document is kept concise, focusing on the key characteristics of the dataset and outlining plans for developing the prediction algorithm and Shiny app.
Let’s begin by examining some summary statistics of the dataset:
summary(my_dataset)
## word1 word2 word3 word4
## Length:107545 Length:107545 Length:107545 Length:107545
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Analysis of Data

### Frequency of Word Combinations

We can calculate the frequency of word combinations in the dataset to identify common patterns:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Combine each 4-gram into a single phrase string, then count occurrences
my_dataset$phrase <- paste(my_dataset$word1, my_dataset$word2,
                           my_dataset$word3, my_dataset$word4, sep = " ")
phrase_freq <- my_dataset %>%
  group_by(phrase) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
head(phrase_freq, 10)
## # A tibble: 10 × 2
## phrase count
## <chr> <int>
## 1 i have no doubt 17
## 2 i think that i 17
## 3 have no doubt that 14
## 4 the adventure of the 14
## 5 gutenberg literary archive foundation 13
## 6 i do not know 13
## 7 project gutenberg literary archive 13
## 8 the project gutenberg literary 13
## 9 project gutenbergtm electronic works 12
## 10 it seemed to me 11
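For the prediction task itself, the more useful quantity is the conditional count: how often each fourth word follows a given three-word context. A sketch of that aggregation with the same dplyr verbs (the name next_word_freq is illustrative):

# Count how often each next word (word4) follows each three-word context
next_word_freq <- my_dataset %>%
  filter(!is.na(word4)) %>%
  count(word1, word2, word3, word4, sort = TRUE)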
### Visualization of Word Combinations

Let’s visualize the top 20 most frequent word combinations:
library(ggplot2)
# Plot the 20 most frequent word combinations as a bar chart
ggplot(head(phrase_freq, 20), aes(x = reorder(phrase, -count), y = count)) +
  geom_bar(stat = "identity", fill = "red") +
  labs(title = "Top 20 Most Frequent Word Combinations",
       x = "Word Combination",
       y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Plans for Prediction Algorithm and Shiny App

The next step will be to use this data to train a prediction algorithm that, given the preceding word combination, predicts the most likely next word. That algorithm will then be made available through a Shiny app.
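A minimal sketch of what a simple lookup-based predictor could look like, assuming the model is just the my_dataset counts (the function name and interface are illustrative, and a real version would need backoff for unseen contexts):

# Hypothetical helper: return the word most often seen after the given
# three-word context in my_dataset (NA when the context never occurs).
predict_next_word <- function(w1, w2, w3, data = my_dataset) {
  candidates <- data %>%
    filter(word1 == w1, word2 == w2, word3 == w3, !is.na(word4)) %>%
    count(word4, sort = TRUE)
  if (nrow(candidates) == 0) return(NA_character_)
  candidates$word4[1]
}
predict_next_word("i", "have", "no")   # plausibly "doubt", given the counts above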