library(tidyverse)
library(stringr)
library(quanteda)
library(quanteda.textplots)
library(DT)
# Load News Story ----
# Print the active RStudio document context. The call only works inside a
# running RStudio session, so it is skipped elsewhere (e.g. under Rscript)
# instead of aborting the whole script.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
  rstudioapi::isAvailable()) {
  rstudioapi::getActiveDocumentContext()
}
## Document Context:
## - id: 'D00E90B8'
## - path: 'C:/Users/mvx13/OneDrive - Texas State University/00_AIT_Lab/NSTI_Workshop/STC_2025_inClass.Rmd'
## - contents: <117 rows>
## Document Selection:
## - [12, 35] -- [12, 35]: ''
# Read the news stories (swap in your own path or file name if it differs)
dat <- read.csv(
  file = "random_news_stories.csv",
  stringsAsFactors = FALSE
)
# Report the dimensions (rows, columns) of the loaded data
dim(dat)
## [1] 30 3
library(DT)
# Interactive table of the raw data: 10 rows per page, no row names, and a
# "csv" button provided through the Buttons extension.
datatable(
  data = dat,
  rownames = FALSE,
  extensions = "Buttons",
  options = list(dom = "Bfrtip", buttons = "csv", pageLength = 10)
)
# Clean and Preprocess Data ----
# Normalise raw story text ahead of tokenisation.
#
# x: character vector of raw texts.
# Returns a character vector of the same length: lower-cased and trimmed,
# with URLs, @mentions, #hashtags, a leading "RT" marker, "pictwittercom"
# residue and punctuation removed, "&" spelled out as "and", and line breaks
# (real ones or an escaped "\n") replaced by a space.
#
# Order matters: every pattern anchored on a punctuation character ("@",
# "#", the backslash of an escaped newline) must run BEFORE the blanket
# punctuation strip, otherwise it can never match.
clean_texts <- function(x) {
  x %>%
    # \S+ stops at the next whitespace, so only the URL itself is dropped
    # (a trailing ".*" would also delete the rest of the line).
    str_remove_all(" ?(f|ht)tps?://\\S+") %>%
    # Escaped "\n" sequences and real line breaks both become a space.
    str_replace_all("\\\\n|[\\r\\n]+", " ") %>%
    str_remove_all("@[[:alnum:]]+") %>%
    str_remove_all("#[[:alnum:]]+") %>%
    str_replace_all("&", "and") %>%
    str_remove_all("[[:punct:]]") %>%
    # Kept after the punctuation strip so leading quotes etc. are gone first.
    str_remove_all("^RT:? ") %>%
    # "pic.twitter.com" with its dots already stripped.
    str_remove_all("pictwittercom") %>%
    str_to_lower() %>%
    str_trim()
}
# Apply to NewsStory column.
# `[[` matches the column name exactly (`$` would accept a unique partial
# match), and the explicit check reports a missing column up front instead
# of failing later with a replacement-length error.
if (!"NewsStory" %in% names(dat)) {
  stop("`dat` has no column named 'NewsStory'.", call. = FALSE)
}
dat[["clean_story"]] <- clean_texts(dat[["NewsStory"]])
# Token Development ----
# Create a corpus from cleaned text
corp <- corpus(dat$clean_story)
# Combine domain-specific and general stopwords.
# clean_texts() strips punctuation, so contractions reach the tokenizer
# without their apostrophe ("don't" -> "dont"). The list therefore also
# carries punctuation-free variants of the general stopwords; without them
# those forms slip through the filter.
# Trade-off: a stripped form that collides with a real word (e.g. "we'll"
# -> "well") is removed as well, since the two are indistinguishable in the
# cleaned text.
custom_stops <- unique(c(
  "crash", "vehicle", "driver", "car", "mph", "kmh",
  letters, "westbound", "eastbound", "northbound", "southbound",
  "na", stopwords("en"),
  str_remove_all(stopwords("en"), "[[:punct:]]")
))
# Tokenize, drop punctuation/numbers and stopwords, then build the
# document-feature matrix
dfm_clean <- corp %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(pattern = custom_stops) %>%
  dfm()
# Frequent Words ----
# Thirty most frequent features of the cleaned dfm, drawn as a bar chart
# with axis labels perpendicular to the axes (las = 2)
top_terms <- topfeatures(dfm_clean, n = 30)
barplot(
  height = top_terms,
  main = "Top 30 Frequent Words",
  col = "steelblue",
  las = 2
)

# Word Co-occurrence Network ----
# Restrict the feature co-occurrence matrix to the most frequent terms
top_words <- names(top_terms)
fcm_obj <- fcm(dfm_clean)
fcm_top <- fcm_select(fcm_obj, pattern = top_words)
# Draw the co-occurrence network of those terms
textplot_network(
  fcm_top,
  min_freq = 0.1,
  vertex_labelsize = 8,
  edge_size = 2,
  edge_color = "#798E87",
  edge_alpha = 0.4
)
