## My dataset. The data is just The Project Gutenberg eBook of The Heel of Achilles in txt format. I formatted it into an n-gram data frame and show its first rows below with “head(ngram_dataset, 10)”.

# Path to the plain-text source file
file_path <- "/cloud/project/pg73352.txt"

# Read the file line by line and lower-case every line
text <- tolower(readLines(file_path, warn = FALSE))

# Join all lines into a single string
text <- paste(text, collapse = " ")

# Strip punctuation, collapse whitespace runs to one space, and trim the
# ends so that splitting on " " cannot yield an empty leading token
text <- gsub("[[:punct:]]", "", text)
text <- trimws(gsub("\\s+", " ", text))

# Split the text into words, discarding any empty tokens
words <- strsplit(text, " ", fixed = TRUE)[[1]]
words <- words[nzchar(words)]

# Order of the n-grams
n <- 3

# Build every n-gram in one vectorised pass instead of growing a data frame
# with rbind() inside a loop (which copies the whole frame on each step).
n_ngrams <- length(words) - n + 1
if (n_ngrams >= 1) {
  starts <- seq_len(n_ngrams)
  # Element k of the list holds the (k + 1)-th word of every n-gram;
  # paste() then glues the columns together with single spaces.
  ngram <- do.call(
    paste,
    lapply(seq_len(n) - 1L, function(k) words[starts + k])
  )
  # Indexing one past the end of `words` yields NA, so the final n-gram,
  # which has no successor, gets NA as its next word.
  next_word <- words[starts + n]
} else {
  # Fewer than n words: there are no n-grams at all
  ngram <- character(0)
  next_word <- character(0)
}

# Assemble the dataset: one row per n-gram plus the word that follows it
ngram_dataset <- data.frame(
  word = ngram,
  next_word = next_word,
  stringsAsFactors = FALSE
)

# Show the first rows of the dataset
head(ngram_dataset, 10)
##                         word    next_word
## 1                    i am an       orphan
## 2               am an orphan    reflected
## 3        an orphan reflected        lydia
## 4     orphan reflected lydia      raymond
## 5    reflected lydia raymond         with
## 6         lydia raymond with      immense
## 7       raymond with immense satisfaction
## 8  with immense satisfaction          she
## 9   immense satisfaction she          was
## 10      satisfaction she was            a

Introduction

The goal of this project is to analyze the ngram_dataset and summarize the major features of the data, as well as outline plans for creating a next word prediction algorithm and Shiny app.

Summary Statistics

Let’s start by examining some summary statistics of the ngram_dataset:

# Per-column overview of the dataset (length, class and mode of each column)
summary(ngram_dataset)
##      word            next_word        
##  Length:55717       Length:55717      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

## Analysis of Data

### Frequency of Words

We can calculate the frequency of each n-gram stored in the word column:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Count how many times each distinct n-gram occurs in the `word` column
word_freq <- ngram_dataset %>%
  group_by(word) %>%
  summarise(count = n())

# Order the counts so the most frequent n-grams come first
word_freq <- arrange(word_freq, desc(count))

# Show the ten most frequent n-grams
head(word_freq, n = 10)
## # A tibble: 10 × 2
##    word                         count
##    <chr>                        <int>
##  1 the project gutenberg           30
##  2 said aunt beryl                 27
##  3 that she had                    21
##  4 aunt beryl and                  20
##  5 and uncle george                18
##  6 mr monteagle almond             18
##  7 of her own                      18
##  8 project gutenberg electronic    18
##  9 i dont know                     16
## 10 she did not                     16
library(ggplot2)


# Bar chart of the most frequent n-grams.
# The title is derived from the number of rows actually plotted, so the
# label always agrees with the number of bars drawn.
top_ngrams <- head(word_freq, 15)
ggplot(top_ngrams, aes(x = reorder(word, -count), y = count)) +
  geom_col(fill = "skyblue") +
  labs(title = paste("Top", nrow(top_ngrams), "Most Frequent Words"),
       x = "Word",
       y = "Frequency") +
  # Rotate the labels: the n-grams are too long to fit horizontally
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Most Common Next Words

Let’s also identify the most common next words in the next_word column:

# Tally how often each word follows an n-gram, most frequent first.
# The final n-gram has no successor and carries NA in `next_word`; that row
# is dropped so NA is not reported as if it were a word.
next_word_freq <- ngram_dataset %>%
  filter(!is.na(next_word)) %>%
  group_by(next_word) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Show the ten most common next words
head(next_word_freq, 10)
## # A tibble: 10 × 2
##    next_word count
##    <chr>     <int>
##  1 the        2469
##  2 and        1596
##  3 to         1573
##  4 of         1557
##  5 a          1159
##  6 her        1122
##  7 she         891
##  8 in          825
##  9 was         761
## 10 that        756

## Plans for Prediction Algorithm and Shiny App

As we can see, the data needs further preparation: removing stop words and frequently repeated articles and pronouns. The next step will be to use the ngram_dataset to train a prediction algorithm. We will develop an n-gram model to predict the next word in a sequence of words. Additionally, we plan to create a Shiny app interface for users to interact with the prediction algorithm.

## Conclusion

This document provides a summary of the ngram_dataset and outlines plans for creating a next word prediction algorithm and Shiny app.