Introduction

The goal of this project is to build a predictive text model using natural language processing techniques. This report summarizes the initial exploration of the dataset and outlines plans for building the final prediction algorithm and Shiny application. The analysis is presented in a simple and concise manner for a non-technical audience.

Data Description

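The dataset consists of three English text files:

- en_US.blogs.txt (blog posts)
- en_US.news.txt (news articles)
- en_US.twitter.txt (Twitter messages)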

These files contain large amounts of natural language text collected from different sources. The data was loaded into R and processed for analysis.

Loading Required Libraries

library(stringi)
library(tm)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
# Read the three raw text files from the working directory, one character
# string per line. Adjust the file names below if the data lives elsewhere.
# skipNul = TRUE drops embedded NUL bytes instead of truncating the line.
read_raw_text <- function(path) {
  readLines(path, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_raw_text("en_US.blogs.txt")
news <- read_raw_text("en_US.news.txt")
twitter <- read_raw_text("en_US.twitter.txt")
# Size of each source: number of lines and total number of words.
# stri_count_words() returns one count per line, so the total is their sum.
count_words <- function(txt) sum(stri_count_words(txt))

blog_lines <- length(blogs)
blog_words <- count_words(blogs)

news_lines <- length(news)
news_words <- count_words(news)

twitter_lines <- length(twitter)
twitter_words <- count_words(twitter)

# One row per source file, in the order blogs / news / twitter.
summary_table <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  Lines = c(blog_lines, news_lines, twitter_lines),
  Words = c(blog_words, news_words, twitter_words)
)

summary_table
##      File   Lines    Words
## 1   Blogs  899288 37546806
## 2    News 1010206 34761151
## 3 Twitter 2360148 30096690
# Fix the RNG state so the same lines are drawn on every run.
set.seed(123)

# Draw 1% of the lines from each source (without replacement) and pool them
# into a single character vector, in the order blogs, news, twitter.
sample_fraction <- 0.01
draw_lines <- function(lines) {
  sample(lines, length(lines) * sample_fraction)
}

sample_data <- c(
  draw_lines(blogs),
  draw_lines(news),
  draw_lines(twitter)
)
# Create corpus
# VCorpus() is used rather than Corpus(): Corpus() on a VectorSource yields a
# SimpleCorpus, and tm_map() on a SimpleCorpus raised a
# "transformation drops documents" warning for every cleaning step below.
corpus <- VCorpus(VectorSource(sample_data))

# Clean text
# Lower-case first so the (lower-case) stop word list can match.
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove stop words BEFORE stripping punctuation: stopwords("english")
# includes contractions such as "don't" and "i'm". If punctuation is removed
# first they turn into "dont" / "im", no longer match the list, and end up in
# the word frequencies.
corpus <- tm_map(corpus, removeWords, stopwords("english"))

corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# Collapse the runs of spaces left behind by the removals above.
corpus <- tm_map(corpus, stripWhitespace)

# NOTE(review): any frequency output recorded in this report was produced
# with the previous cleaning order; counts may shift slightly on re-run.
library(slam)

# Term-document matrix of the cleaned sample.
tdm <- TermDocumentMatrix(corpus)

# Total count of each term across all documents, most frequent first.
# slam::row_sums() sums the sparse matrix without converting it to a dense one.
term_totals <- row_sums(tdm)
word_freq <- sort(term_totals, decreasing = TRUE)

# Two-column table (word, freq); the words also remain as row names.
freq_df <- data.frame(word = names(word_freq), freq = word_freq)
head(freq_df, 10)
##      word freq
## will will 3154
## said said 3076
## just just 3066
## one   one 2933
## like like 2640
## can   can 2426
## get   get 2285
## time time 2083
## new   new 1929
## now   now 1763
# Histogram of the counts of the first 1000 rows of freq_df, i.e. the 1000
# most frequent words (freq_df is sorted by decreasing frequency).
ggplot(data = freq_df[seq_len(1000), ], mapping = aes(x = freq)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Histogram of Word Frequencies",
    x = "Frequency",
    y = "Count"
  )

# The 20 most frequent words (freq_df is sorted by decreasing frequency).
top_words <- freq_df[seq_len(20), ]

# Horizontal bar chart; reorder() sorts the bars by frequency so the most
# frequent word ends up at the top after coord_flip().
ggplot(top_words, aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Words",
    x = "Words",
    y = "Frequency"
  )

# Word cloud of up to 100 words that occur at least 50 times in the sample.
# random.order = FALSE plots words in decreasing frequency order.
cloud_palette <- brewer.pal(8, "Dark2")

wordcloud(
  freq_df$word,
  freq_df$freq,
  min.freq = 50,
  max.words = 100,
  random.order = FALSE,
  colors = cloud_palette
)