607 Lab 10 sentiment analysis

## Warning: package 'tidytext' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Tally the Bing lexicon entries under each sentiment label.
# The grouping is kept on the result, so it prints as a grouped tibble.
get_sentiments("bing") %>%
  group_by(sentiment) %>%
  summarise(n = n(), .groups = "keep")

## # A tibble: 2 × 2
## # Groups:   sentiment [2]
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

# Raw CSV location on GitHub.
url <- "https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/sentimentdataset.csv"

# Read the CSV and keep only the Year and Text columns.
tweet <- read.csv(url) %>%
  select(Year, Text)

# Tokenise the Text column into a `word` column, one token per row.
tok_tweet <- unnest_tokens(tweet, output = word, input = Text)

# Load the stop_words table, then drop every token that appears in it.
# No join key is given, so dplyr infers it from the shared column and
# reports the choice in a message.
data("stop_words")
clean_tweet <- anti_join(tok_tweet, stop_words)

## Joining with `by = join_by(word)`

# Horizontal bar chart of the most frequent words left after stop-word removal.
clean_tweet %>%
  count(word, sort = TRUE) %>%
  # Keep the rows with the ten largest counts.
  slice_max(order_by = n, n = 10) %>%
  # Order the word factor by count so the bars are drawn in frequency order.
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  labs(title = "Top ten frequently used tweeter words")

# Attach an AFINN score to each cleaned token (the inner join drops words
# that have no entry in the lexicon), then total the scores within each Year.
aff_score <- clean_tweet %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(Year) %>%
  summarise(sentiment = sum(value), .groups = "drop")



# Bar chart of the summed AFINN score for each year.
# geom_col() takes bar heights directly from the y values; it replaces the
# older geom_bar(stat = "identity") spelling and matches the geom_col() call
# used for the word-frequency chart.
ggplot(aff_score, aes(x = Year, y = sentiment)) +
  geom_col() +
  # One axis tick per year from 2010 through 2023.
  scale_x_continuous(breaks = seq(2010, 2023, 1)) +
  labs(title = "Overall tweeter sentiment by years")

607 Lab 10 sentiment analysis

Chi Hang (Philip) Cheung

2024-11-11