# Load necessary libraries
library(htmltools)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # already attached by tidyverse 2.0; reloading is harmless
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(sentimentr)
library(ggplot2) # already attached by tidyverse; reloading is harmless
library(tidytext)
library(textdata)
library(RColorBrewer) # for brewer.pal(); also attached above by wordcloud
# Read the data
reddit_data <- read.csv("wallstreetbets2.csv", stringsAsFactors = FALSE)
# Convert date_utc to a proper datetime (POSIXct)
reddit_data$date <- as_datetime(reddit_data$date_utc)
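# Sketch (optional sanity check, not in the original run; output not shown):
# confirm the parsed dates fall in the expected window.
range(reddit_data$date, na.rm = TRUE)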
# Basic summary statistics
summary(reddit_data$comments)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     1.0    19.0    47.0   694.7   130.0 21718.0
# Time series analysis - posts per day
daily_posts <- reddit_data %>%
  mutate(date = as.Date(date)) %>%
  count(date)
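# Sketch (not in the original output): daily_posts is computed above but
# never visualized; a simple line chart shows posting volume over time.
ggplot(daily_posts, aes(x = date, y = n)) +
  geom_line(color = "steelblue") +
  labs(title = "Reddit Posts per Day",
       x = "Date",
       y = "Number of Posts") +
  theme_minimal()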
# Analyze comment engagement
ggplot(reddit_data, aes(x = comments)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(title = "Distribution of Comments on Reddit Posts",
       x = "Number of Comments",
       y = "Count") +
  theme_minimal()

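# Sketch (not in the original analysis): comment counts are heavily
# right-skewed (median 47 vs. mean ~695 per the summary above), so a log10
# x-axis makes the bulk of the distribution easier to read.
ggplot(reddit_data, aes(x = comments)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  scale_x_log10() +
  labs(title = "Distribution of Comments (log10 scale)",
       x = "Number of Comments",
       y = "Count") +
  theme_minimal()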
# Find most engaging posts (most comments)
top_posts <- reddit_data %>%
  arrange(desc(comments)) %>%
  select(date, title, comments) %>%
  head(10)
print(top_posts)
##          date                                                       title comments
## 1  2025-06-20 Weekend Discussion Thread for the Weekend of June 20, 2025    21718
## 2  2025-06-05                   Daily Discussion Thread for June 05, 2025    18369
## 3  2025-06-13 Weekend Discussion Thread for the Weekend of June 13, 2025    17491
## 4  2025-06-13                   Daily Discussion Thread for June 13, 2025    14842
## 5  2025-06-12                 What Are Your Moves Tomorrow, June 13, 2025    14771
## 6  2025-06-11                   Daily Discussion Thread for June 11, 2025    14725
## 7  2025-06-27                   Daily Discussion Thread for June 27, 2025    14219
## 8  2025-06-23                   Daily Discussion Thread for June 23, 2025    13844
## 9  2025-06-06                   Daily Discussion Thread for June 06, 2025    13264
## 10 2025-06-05                 What Are Your Moves Tomorrow, June 06, 2025    13176
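# Sketch (not in the original analysis): the top posts are all recurring
# megathreads; filtering those out surfaces the most-engaging organic posts.
# The regex below is an assumption based on the title patterns printed above.
organic_posts <- reddit_data %>%
  filter(!str_detect(title, "Discussion Thread|What Are Your Moves")) %>%
  arrange(desc(comments)) %>%
  select(date, title, comments) %>%
  head(10)
print(organic_posts)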
# Text analysis - create a corpus from post titles
corpus_titles <- Corpus(VectorSource(reddit_data$title))
corpus_titles <- corpus_titles %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(tolower) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., tolower): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Create document-term matrix for titles
dtm_titles <- DocumentTermMatrix(corpus_titles)
freq_titles <- colSums(as.matrix(dtm_titles))
word_freq_titles <- data.frame(word = names(freq_titles), freq = freq_titles)
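# Sketch (not in the original analysis): for larger corpora, as.matrix() on
# a full document-term matrix can exhaust memory; tm's removeSparseTerms()
# drops rare terms first (the 0.99 cutoff here is an arbitrary assumption).
dtm_titles_small <- removeSparseTerms(dtm_titles, sparse = 0.99)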
# Plot most common words in titles
word_freq_titles %>%
  arrange(desc(freq)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Most Common Words in Reddit Post Titles",
       x = "Word",
       y = "Frequency") +
  theme_minimal()

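# Sketch (alternative approach, not in the original analysis): tidytext is
# already loaded, and its unnest_tokens() reproduces these word counts
# without the tm corpus warnings seen above.
title_words <- reddit_data %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)
head(title_words, 20)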
# Text analysis - create a corpus from post content (if 'text' column exists)
if("text" %in% colnames(reddit_data)) {
corpus_text <- Corpus(VectorSource(reddit_data$text))
corpus_text <- corpus_text %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(tolower) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(stripWhitespace)
# Create document-term matrix for text content
dtm_text <- DocumentTermMatrix(corpus_text)
freq_text <- colSums(as.matrix(dtm_text))
word_freq_text <- data.frame(word = names(freq_text), freq = freq_text)
# Plot most common words in post content
word_freq_text %>%
arrange(desc(freq)) %>%
head(30) %>%
ggplot(aes(x = reorder(word, freq), y = freq)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Most Common Words in Reddit Post Content", # Fixed title
x = "Word",
y = "Frequency") +
theme_minimal()
# Use text content for word cloud and sentiment analysis
word_freq_for_analysis <- word_freq_text
} else {
# Use titles if text column doesn't exist
word_freq_for_analysis <- word_freq_titles
}
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., tolower): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Generate word cloud
set.seed(123)
wordcloud(words = word_freq_for_analysis$word,
          freq = word_freq_for_analysis$freq,
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))

# Create the interactive word cloud
wordcloud2(data = word_freq_for_analysis,
           size = 1,
           color = "random-dark",
           backgroundColor = "white",
           shape = "circle",
           rotateRatio = 0.3)
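# Sketch (not in the original output): sentimentr is loaded above but never
# used; a minimal pass over the post titles could look like this.
# sentiment_by() averages sentence-level polarity per title.
title_sentiment <- sentiment_by(get_sentences(reddit_data$title))
summary(title_sentiment$ave_sentiment)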