library(tidyverse)
library(stringr)
library(quanteda)
library(quanteda.textplots)
library(DT)

0.1 Load News Stories

# Load CSV (replace with your path or file name)
dat <- read.csv("random_news_stories.csv", stringsAsFactors = FALSE)

# Show structure and preview
dim(dat)
## [1] 30  3
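The dimensions confirm 30 stories across 3 columns. To also preview the column names and types (the only column assumed later is NewsStory), base R's str() works:

# Inspect column names, types, and the first few values of each column
str(dat)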

datatable(
  dat, 
  extensions = 'Buttons',
  options = list(
    dom = 'Bfrtip',
    buttons = c('csv'),
    pageLength = 10
  ),
  rownames = FALSE
)

0.2 Clean and Preprocess Data

# Define cleaning function
clean_texts <- function(x) {
  x %>%
    # Remove URLs first, before punctuation stripping breaks them apart
    str_remove_all("\\s?(f|ht)tps?://\\S+") %>%
    # Convert HTML-encoded ampersands to "and"
    str_replace_all("&amp;", "and") %>%
    # Remove retweet markers, @mentions, and #hashtags while their symbols are still intact
    str_remove_all("^RT:? ") %>%
    str_remove_all("@[[:alnum:]]+") %>%
    str_remove_all("#[[:alnum:]]+") %>%
    # Strip remaining punctuation, then any leftover Twitter image links
    str_remove_all("[[:punct:]]") %>%
    str_remove_all("pictwittercom") %>%
    # Replace literal "\n" sequences and real newlines with spaces, lowercase, and trim
    str_replace_all("\\\\n|\n", " ") %>%
    str_to_lower() %>%
    str_trim()
}

# Apply to NewsStory column
dat$clean_story <- clean_texts(dat$NewsStory)
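A quick spot check helps confirm the cleaning behaves as intended; the sketch below simply prints the first 200 characters of one story before and after cleaning (the truncation length is arbitrary):

# Compare the first story before and after cleaning (truncated for readability)
substr(dat$NewsStory[1], 1, 200)
substr(dat$clean_story[1], 1, 200)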

0.3 Tokenization

# Create a corpus from cleaned text
corp <- corpus(dat$clean_story)

# Combine domain-specific and general stopwords
custom_stops <- c("crash", "vehicle", "driver", "car", "mph", "kmh", 
                  letters, "westbound", "eastbound", "northbound", "southbound", 
                  "na", stopwords("en"))

# Tokenize and clean
dfm_clean <- corp %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(pattern = custom_stops) %>%
  dfm()
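
As a quick sanity check on the resulting document-feature matrix, quanteda's ndoc() and nfeat() report how many documents and unique terms remain after stopword removal:

# Documents and unique features left after cleaning and stopword removal
ndoc(dfm_clean)
nfeat(dfm_clean)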

0.4 Frequent Words

# Show top 30 terms
top_terms <- topfeatures(dfm_clean, 30)
barplot(top_terms, las = 2, col = "steelblue", main = "Top 30 Frequent Words")
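
A word cloud offers an alternative view of the same frequencies; since quanteda.textplots is already loaded, a minimal sketch (the 50-word cap and the seed are arbitrary choices) is:

# Word cloud of the most frequent terms; the seed fixes the random layout
set.seed(123)
textplot_wordcloud(dfm_clean, max_words = 50, color = "steelblue")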

0.5 Word Co-occurrence Network

# Select most frequent terms
top_words <- names(top_terms)
fcm_obj <- fcm(dfm_clean)
fcm_top <- fcm_select(fcm_obj, pattern = top_words)

# Network plot
textplot_network(fcm_top, min_freq = 0.1, edge_alpha = 0.4,
                 edge_color = "#798E87", edge_size = 2,
                 vertex_labelsize = 8)
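
The network shows which words tend to appear together; to see the underlying counts, one option (a sketch that assumes fcm_top converts cleanly with as.matrix()) is to reshape the co-occurrence matrix into a ranked table of word pairs:

# Rank word pairs by how often they co-occur
m <- as.matrix(fcm_top)
m[lower.tri(m, diag = TRUE)] <- 0                      # count each pair once
pairs <- as.data.frame(as.table(m), stringsAsFactors = FALSE)
names(pairs) <- c("word1", "word2", "count")
head(pairs[order(-pairs$count), ], 10)                 # ten strongest pairs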