# Load necessary libraries
library(htmltools)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # already attached by tidyverse 2.0; reloading is harmless
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(sentimentr)
library(ggplot2) # already attached by tidyverse; reloading is harmless
library(tidytext)
library(textdata)
library(RColorBrewer) # for brewer.pal(); also attached above by wordcloud
# Read the data
reddit_data <- read.csv("wallstreetbets2.csv", stringsAsFactors = FALSE)
# Convert date_utc to a proper datetime (POSIXct)
reddit_data$date <- as_datetime(reddit_data$date_utc)
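# Sketch (optional sanity check, not in the original run; output not shown):
# confirm the parsed dates fall in the expected window.
range(reddit_data$date, na.rm = TRUE)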
# Basic summary statistics
summary(reddit_data$comments)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     1.0    19.0    47.0   694.7   130.0 21718.0
# Time series analysis - posts per day
daily_posts <- reddit_data %>%
  mutate(date = as.Date(date)) %>%
  count(date)
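# Sketch (not in the original output): daily_posts is computed above but
# never visualized; a simple line chart shows posting volume over time.
ggplot(daily_posts, aes(x = date, y = n)) +
  geom_line(color = "steelblue") +
  labs(title = "Reddit Posts per Day",
       x = "Date",
       y = "Number of Posts") +
  theme_minimal()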
# Analyze comment engagement
ggplot(reddit_data, aes(x = comments)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(title = "Distribution of Comments on Reddit Posts",
       x = "Number of Comments",
       y = "Count") +
  theme_minimal()

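# Sketch (not in the original analysis): comment counts are heavily
# right-skewed (median 47 vs. mean ~695 per the summary above), so a log10
# x-axis makes the bulk of the distribution easier to read.
ggplot(reddit_data, aes(x = comments)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  scale_x_log10() +
  labs(title = "Distribution of Comments (log10 scale)",
       x = "Number of Comments",
       y = "Count") +
  theme_minimal()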
# Find most engaging posts (most comments)
top_posts <- reddit_data %>%
  arrange(desc(comments)) %>%
  select(date, title, comments) %>%
  head(10)
print(top_posts)
##          date                                                       title comments
## 1  2025-06-20 Weekend Discussion Thread for the Weekend of June 20, 2025    21718
## 2  2025-06-05                   Daily Discussion Thread for June 05, 2025    18369
## 3  2025-06-13 Weekend Discussion Thread for the Weekend of June 13, 2025    17491
## 4  2025-06-13                   Daily Discussion Thread for June 13, 2025    14842
## 5  2025-06-12                 What Are Your Moves Tomorrow, June 13, 2025    14771
## 6  2025-06-11                   Daily Discussion Thread for June 11, 2025    14725
## 7  2025-06-27                   Daily Discussion Thread for June 27, 2025    14219
## 8  2025-06-23                   Daily Discussion Thread for June 23, 2025    13844
## 9  2025-06-06                   Daily Discussion Thread for June 06, 2025    13264
## 10 2025-06-05                 What Are Your Moves Tomorrow, June 06, 2025    13176
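# Sketch (not in the original analysis): the top posts are all recurring
# megathreads; filtering those out surfaces the most-engaging organic posts.
# The regex below is an assumption based on the title patterns printed above.
organic_posts <- reddit_data %>%
  filter(!str_detect(title, "Discussion Thread|What Are Your Moves")) %>%
  arrange(desc(comments)) %>%
  select(date, title, comments) %>%
  head(10)
print(organic_posts)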
# Text analysis - create a corpus from post titles
corpus_titles <- Corpus(VectorSource(reddit_data$title))
corpus_titles <- corpus_titles %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(tolower) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., tolower): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Create document-term matrix for titles
dtm_titles <- DocumentTermMatrix(corpus_titles)
freq_titles <- colSums(as.matrix(dtm_titles))
word_freq_titles <- data.frame(word = names(freq_titles), freq = freq_titles)
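# Sketch (not in the original analysis): for larger corpora, as.matrix() on
# a full document-term matrix can exhaust memory; tm's removeSparseTerms()
# drops rare terms first (the 0.99 cutoff here is an arbitrary assumption).
dtm_titles_small <- removeSparseTerms(dtm_titles, sparse = 0.99)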
# Plot most common words in titles
word_freq_titles %>%
  arrange(desc(freq)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Most Common Words in Reddit Post Titles",
       x = "Word",
       y = "Frequency") +
  theme_minimal()

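# Sketch (alternative approach, not in the original analysis): tidytext is
# already loaded, and its unnest_tokens() reproduces these word counts
# without the tm corpus warnings seen above.
title_words <- reddit_data %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)
head(title_words, 20)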
# Text analysis - create a corpus from post content (if 'text' column exists)
if("text" %in% colnames(reddit_data)) {
corpus_text <- Corpus(VectorSource(reddit_data$text))
corpus_text <- corpus_text %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(tolower) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(stripWhitespace)
# Create document-term matrix for text content
dtm_text <- DocumentTermMatrix(corpus_text)
freq_text <- colSums(as.matrix(dtm_text))
word_freq_text <- data.frame(word = names(freq_text), freq = freq_text)
# Plot most common words in post content
word_freq_text %>%
arrange(desc(freq)) %>%
head(30) %>%
ggplot(aes(x = reorder(word, freq), y = freq)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Most Common Words in Reddit Post Content", # Fixed title
x = "Word",
y = "Frequency") +
theme_minimal()
# Use text content for word cloud and sentiment analysis
word_freq_for_analysis <- word_freq_text
} else {
# Use titles if text column doesn't exist
word_freq_for_analysis <- word_freq_titles
}
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., tolower): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Generate word cloud
set.seed(123)
wordcloud(words = word_freq_for_analysis$word,
          freq = word_freq_for_analysis$freq,
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))

# Create the interactive word cloud
wordcloud2(data = word_freq_for_analysis,
           size = 1,
           color = "random-dark",
           backgroundColor = "white",
           shape = "circle",
           rotateRatio = 0.3)
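# Sketch (not in the original output): sentimentr is loaded above but never
# used; a minimal pass over the post titles could look like this.
# sentiment_by() averages sentence-level polarity per title.
title_sentiment <- sentiment_by(get_sentences(reddit_data$title))
summary(title_sentiment$ave_sentiment)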