knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(stringi)
library(ggplot2)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
Introduction

This report is part of the Coursera Data Science Capstone project. Its goal is to show that the data has been downloaded and loaded, that a basic exploratory analysis has been performed, and that the groundwork is in place for building a text prediction algorithm and Shiny app. The report is kept concise and is written for a non-technical audience.
Data Loading and Summary

The dataset comes from the HC Corpora collection and contains three English text files: blogs, news, and Twitter.
# Read one corpus file as UTF-8 text, returning one character element per line.
#
# The connection is opened in binary mode ("rb") on purpose. In text mode on
# Windows an embedded SUB control byte (0x1A) is treated as end-of-file, so
# readLines() stops early without an error. The news file appears to be
# affected: a text-mode read returned only 77,259 lines from a ~196 MB file.
# readLines() accepts LF, CRLF and CR line endings regardless of the mode the
# connection was opened in, so binary mode does not change line splitting.
#
# path: path to a text file.
# Returns a character vector with embedded NUL bytes skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)  # always release the file handle
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Read the Twitter dataset
twitter <- read_corpus("final/en_US/en_US.twitter.txt")
# Read the Blogs dataset
blogs <- read_corpus("final/en_US/en_US.blogs.txt")
# Read the News dataset
news <- read_corpus("final/en_US/en_US.news.txt")
Basic Stats
# Per-source summary: number of lines read, total word count, and on-disk
# file size in MB. The paths and the loaded texts are listed in the same
# order (blogs, news, twitter) so the columns line up row by row.
corpus_files <- c(
  "final/en_US/en_US.blogs.txt",
  "final/en_US/en_US.news.txt",
  "final/en_US/en_US.twitter.txt"
)
corpus_texts <- list(blogs, news, twitter)

data_summary <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Lines = lengths(corpus_texts),
  Words = vapply(
    corpus_texts,
    function(txt) sum(stri_count_words(txt)),
    integer(1)
  ),
  FileSizeMB = file.info(corpus_files)$size / 1024^2
)

# Print results
data_summary
## Dataset Lines Words FileSizeMB
## 1 Blogs 899288 37546250 200.4242
## 2 News 77259 2674536 196.2753
## 3 Twitter 2360148 30093413 159.3641
# Draw a fixed-size random sample of lines from each source and stack them
# into a one-column tibble. The seed makes the sample reproducible; the draw
# order (blogs, news, twitter) is kept so the random stream is consumed the
# same way on every run.
set.seed(1234)
sample_size <- 5000

blogs_sample <- sample(blogs, sample_size)
news_sample <- sample(news, sample_size)
twitter_sample <- sample(twitter, sample_size)

sample_data <- c(blogs_sample, news_sample, twitter_sample)
corpus <- tibble(text = sample_data)
Cleaning and Tokenizing
# Tokenise the sampled text into lower-case words and drop stop words.
#
# Apostrophes are kept until AFTER the stop-word filter. The stop-word list
# stores contractions with their apostrophe ("don't", "i'm"), so stripping
# apostrophes first would turn them into "dont" / "im", which slip past the
# filter and inflate the frequency counts. Curly apostrophes are normalised
# to the straight form first so both spellings are treated alike.
cleaned <- corpus %>%
  mutate(
    text = str_replace_all(text, "[\u2018\u2019]", "'"),
    # Remove everything except letters, whitespace and apostrophes.
    text = str_replace_all(text, "[^[:alpha:]\\s']", "")
  ) %>%
  unnest_tokens(word, text) %>%
  # First pass: contractions still carry their apostrophe and can match.
  filter(!word %in% stop_words$word) %>%
  # Remaining apostrophes (e.g. possessives) are dropped: "john's" -> "johns".
  mutate(word = str_remove_all(word, "'")) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "^[a-z]+$"))
Top Words
# The 20 most frequent words in the cleaned sample.
# slice_max() replaces the superseded top_n(): the ranking column is named
# explicitly, and with_ties = FALSE guarantees exactly 20 rows (fewer only if
# fewer distinct words exist), so the chart matches its "Top 20" title.
top_words <- cleaned %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20, with_ties = FALSE)

# Horizontal bar chart, most frequent word at the top.
ggplot(top_words, aes(reorder(word, n), n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words", x = "Words", y = "Frequency")
Word Cloud
# Word cloud of up to 100 of the most frequent words.
# The cloud is fed from the full word counts rather than the 20-row
# `top_words` table; otherwise max.words = 100 could never take effect.
# The seed fixes the random word placement.
set.seed(123)
word_freq <- cleaned %>%
  count(word, sort = TRUE)
wordcloud(words = word_freq$word, freq = word_freq$n, max.words = 100, colors = brewer.pal(8, "Dark2"))
# Bigram and trigram frequency tables built from the raw sampled text.
# Lines with fewer than n words produce an NA n-gram; these are dropped so
# "NA" is not counted (and cannot rank) as if it were a phrase.
bigrams <- corpus %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)
trigrams <- corpus %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)

# Keep exactly the 20 most frequent of each. N-gram counts are low, so ties
# at the cut-off are common; with_ties = FALSE stops them from expanding the
# "Top 20" tables well beyond 20 rows.
bigram_top <- bigrams %>% slice_max(n, n = 20, with_ties = FALSE)
trigram_top <- trigrams %>% slice_max(n, n = 20, with_ties = FALSE)
# Build a horizontal bar chart of n-gram counts, most frequent at the top.
#
# counts:      data frame with a count column `n` and a term column.
# term_col:    name (string) of the term column in `counts`.
# bar_fill:    fill colour for the bars.
# chart_title: plot title. Axis titles are intentionally left blank.
plot_top_ngrams <- function(counts, term_col, bar_fill, chart_title) {
  ggplot(counts, aes(x = reorder(.data[[term_col]], n), y = n)) +
    geom_col(fill = bar_fill) +
    coord_flip() +
    labs(title = chart_title, x = "", y = "")
}

p1 <- plot_top_ngrams(bigram_top, "bigram", "darkgreen", "Top 20 Bigrams")
p2 <- plot_top_ngrams(trigram_top, "trigram", "darkred", "Top 20 Trigrams")

# Show both charts side by side.
grid.arrange(p1, p2, ncol = 2)
Planned Approach

- Create frequency tables for unigrams, bigrams, and trigrams.
- Use a backoff strategy to predict the next word (e.g., if there is no trigram match, fall back to bigrams, then to unigrams).
- Apply smoothing techniques to handle unseen word combinations.
- Optimize the model so that it is small enough for Shiny deployment.
Shiny App Goals

- The user types a phrase.
- The app predicts and displays the most likely next word.
- Optionally, the app suggests the top 3 predicted words.