The goal of this project is to build a predictive text model using natural language processing techniques. This report summarizes the initial exploration of the dataset and outlines plans for building the final prediction algorithm and Shiny application. The analysis is presented in a simple and concise manner for a non-technical audience.
The dataset consists of three English text files: `en_US.blogs.txt` (blogs), `en_US.news.txt` (news), and `en_US.twitter.txt` (Twitter).
These files contain large amounts of natural language text collected from different sources. The data was loaded into R and processed for analysis.
library(stringi)
library(tm)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
# Set file paths (update as needed).
# `data_dir` is the folder that holds the three en_US.*.txt files; it defaults
# to the working directory, so this is the only value to change if the data
# lives somewhere else.
data_dir <- "."

# Read one text file from `data_dir` into a character vector, one element per
# line. Stops with a clear message when the file is missing.
read_text_file <- function(file_name) {
  path <- file.path(data_dir, file_name)
  if (!file.exists(path)) {
    stop("Input file not found: ", path, call. = FALSE)
  }
  # Binary mode avoids platform-specific text-mode handling of stray control
  # characters in the raw data; readLines() still accepts LF, CRLF, or CR
  # line endings on a binary connection.
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_file("en_US.blogs.txt")
news <- read_text_file("en_US.news.txt")
twitter <- read_text_file("en_US.twitter.txt")
# Group the three text vectors under the labels used in the summary table
sources <- list(Blogs = blogs, News = news, Twitter = twitter)

# Lines per source (one vector element per line)
line_totals <- vapply(sources, length, integer(1))
blog_lines <- line_totals[["Blogs"]]
news_lines <- line_totals[["News"]]
twitter_lines <- line_totals[["Twitter"]]

# Words per source: count the words on every line, then add them up
word_totals <- vapply(
  sources,
  function(text_lines) sum(stri_count_words(text_lines)),
  integer(1)
)
blog_words <- word_totals[["Blogs"]]
news_words <- word_totals[["News"]]
twitter_words <- word_totals[["Twitter"]]

# One row per source: label, line count, word count
summary_table <- data.frame(
  File = names(sources),
  Lines = unname(line_totals),
  Words = unname(word_totals)
)
summary_table
## File Lines Words
## 1 Blogs 899288 37546806
## 2 News 1010206 34761151
## 3 Twitter 2360148 30096690
# Fix the random number generator state so the same lines are drawn each run
set.seed(123)

# Fraction of each source kept for the exploratory analysis (1%)
sample_fraction <- 0.01

# Draw a random subset of lines from one text vector
draw_subset <- function(text_lines) {
  sample(text_lines, length(text_lines) * sample_fraction)
}

# The draws run in a fixed order (blogs, news, twitter) because each call
# advances the random number generator.
blog_subset <- draw_subset(blogs)
news_subset <- draw_subset(news)
twitter_subset <- draw_subset(twitter)

# Combined sample used by the rest of the analysis
sample_data <- c(blog_subset, news_subset, twitter_subset)
# Create corpus: one document per sampled line
corpus <- Corpus(VectorSource(sample_data))
# Clean text.
# Step order matters: stop words are removed BEFORE punctuation is stripped.
# The English stop-word list contains contractions written with an apostrophe;
# stripping punctuation first would turn those into apostrophe-free tokens
# that no longer match the list and would survive as spurious frequent words.
# NOTE(review): the "transformation drops documents" warnings recorded below
# appear after every tm_map() call on this corpus; confirm whether they are
# benign for an unnamed input vector or whether VCorpus() should be used.
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
# Collapse the runs of spaces left behind by the removals above
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
library(slam)
# Term-by-document count matrix built from the corpus
tdm <- TermDocumentMatrix(corpus)

# Total count of each term across all documents, most frequent first
term_totals <- row_sums(tdm)
word_freq <- sort(term_totals, decreasing = TRUE)

# Frequency table: one row per term, holding the term and its total count
freq_df <- data.frame(word = names(word_freq), freq = word_freq)

# Show the ten most frequent terms
head(freq_df, n = 10)
## word freq
## will will 3154
## said said 3076
## just just 3066
## one one 2933
## like like 2640
## can can 2426
## get get 2285
## time time 2083
## new new 1929
## now now 1763
# Histogram of the counts of the (up to) 1000 most frequent words.
# freq_df is already sorted by decreasing frequency, so head() takes the top
# rows. Unlike freq_df[1:1000, ], head() does not pad the data with all-NA
# rows when the vocabulary has fewer than 1000 terms.
ggplot(head(freq_df, 1000), aes(x = freq)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Histogram of Word Frequencies",
    x = "Frequency",
    y = "Count"
  )
# The 20 most frequent words (fewer if the vocabulary is smaller).
# head() is used instead of freq_df[1:20, ] so a short table never gains
# all-NA rows.
top_words <- head(freq_df, 20)

# Horizontal bar chart, bars ordered by frequency. geom_col() draws bar
# heights straight from the `freq` column.
ggplot(top_words, aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Words",
    x = "Words",
    y = "Frequency"
  )
# Colour palette for the cloud: 8 colours from the "Dark2" ColorBrewer set
cloud_colors <- brewer.pal(8, "Dark2")

# Word cloud of the frequency table, limited by min.freq = 50 and
# max.words = 100
wordcloud(
  words = freq_df$word,
  freq = freq_df$freq,
  min.freq = 50,
  max.words = 100,
  random.order = FALSE,
  colors = cloud_colors
)