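This report presents the preliminary phases of my Capstone Project for Module 2. The ultimate objective of this endeavor is to develop a text prediction algorithm and a corresponding Shiny application. This document focuses on the foundational tasks that set the stage for more complex analyses: setting up the environment, drawing a random sample of lines from the blog, Twitter, and news datasets, tokenizing each sample into words, and plotting the 25 most frequent words in each sample.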
We need to ensure all necessary packages are loaded and the environment is correctly set up.
library(dplyr)
library(tidytext)
library(ggplot2)
# Set the working directory.
# The project folder defaults to the original location, but it can be
# overridden with the MILESTONE_PROJECT_DIR environment variable so the report
# also runs on machines where the project lives somewhere else.
project_dir <- Sys.getenv(
  "MILESTONE_PROJECT_DIR",
  unset = "~/Desktop/R Programming Course/Milestone Project"
)
# Fail early with a readable message instead of the generic setwd() error.
if (!dir.exists(project_dir)) {
  stop("Project directory not found: ", project_dir, call. = FALSE)
}
setwd(project_dir)
# Sample up to 1000 lines from each dataset.
# NOTE(review): blog_lines, twitter_lines and news_lines are not created in
# this section; they are presumably character vectors holding the raw lines of
# each corpus (e.g. read with readLines()) -- confirm they exist at this point.

# Fix the RNG seed so the sampled lines, and therefore every table and plot
# derived from them, are the same on each run of the report.
set.seed(1234)

# Cap the sample size at the corpus length: sample() without replacement
# raises an error when asked for more elements than the vector contains.
sample_size <- 1000
sample_blog_lines <- sample(
  blog_lines,
  min(sample_size, length(blog_lines))
)
sample_twitter_lines <- sample(
  twitter_lines,
  min(sample_size, length(twitter_lines))
)
sample_news_lines <- sample(
  news_lines,
  min(sample_size, length(news_lines))
)
# Tokenizing samples
# Each sample is wrapped in a one-column tibble (`text`) and split with
# unnest_tokens() using its default settings, which yields one row per token
# in a column named `word`.
tokenized_blog <- unnest_tokens(
  tibble(text = sample_blog_lines),
  word,
  text
)
tokenized_twitter <- unnest_tokens(
  tibble(text = sample_twitter_lines),
  word,
  text
)
tokenized_news <- unnest_tokens(
  tibble(text = sample_news_lines),
  word,
  text
)
# Word-frequency tables: the 25 most frequent words of each sample.
# count() tallies each distinct word into column `n`, sorted in descending
# order. slice_max() replaces the superseded top_n(): the ranking column `n`
# is named explicitly, so the "Selecting by n" message is no longer emitted.
# Like top_n(), slice_max() keeps ties by default, so a table may hold more
# than 25 rows when several words share the cut-off count.

# Create frequency data for blogs
blog_freq <- tokenized_blog %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 25)

# Create frequency data for Twitter
twitter_freq <- tokenized_twitter %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 25)

# Create frequency data for news
news_freq <- tokenized_news %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 25)
# Plotting

# Build a horizontal bar chart of word counts for one corpus sample.
#
# Arguments:
#   freq_data    data frame with a `word` column and a count column `n`
#                (the shape produced by count(word)).
#   bar_fill     fill colour of the bars.
#   source_label corpus name inserted into the plot title.
#
# Returns the ggplot object; calling the function at top level prints it.
plot_top_words <- function(freq_data, bar_fill, source_label) {
  # reorder() sorts the words by their count so the bars appear in rank order;
  # coord_flip() turns the bars horizontal so the word labels stay readable.
  ggplot(freq_data, aes(x = reorder(word, n), y = n)) +
    geom_col(fill = bar_fill) + # same as geom_bar(stat = "identity")
    coord_flip() +
    labs(
      title = paste("Top 25 Word Frequencies -", source_label, "Sample"),
      x = "Words",
      y = "Frequency"
    ) +
    theme_minimal()
}

plot_top_words(blog_freq, "orange", "Blog")
plot_top_words(twitter_freq, "lightblue", "Twitter")
plot_top_words(news_freq, "darkgreen", "News")