Scraping Twitter data using the Twitter API and performing sentiment analysis

Blog post 5, describing the use of the Twitter API to scrape data and perform sentiment analysis with dictionary methods, as part of the course “Text as Data”

Rahul Gundeti (Graduate student, Data Analytics & Computational Social Sciences (DACSS), UMass Amherst)
2022-05-03

Loading required libraries
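
The chunk itself is not shown; a minimal sketch of the packages this walkthrough relies on, assuming rtweet for the API calls, tidytext for tokenization and sentiment lexicons, the tidyverse for wrangling, ggpubr for the pie charts, and knitr for the tables:

library(rtweet)    # Twitter API access: get_timeline()
library(tidyverse) # dplyr, tidyr, stringr, ggplot2 for wrangling and plotting
library(tidytext)  # unnest_tokens(), stop_words, get_sentiments()
library(ggpubr)    # ggpie() for the sentiment pie charts
library(knitr)     # kable() for formatted tables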

Accessing Twitter API tokens
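
The keys themselves are omitted; a minimal sketch of the authentication step, assuming rtweet's create_token() with placeholder credentials from a Twitter developer app (all values below are hypothetical):

token <- create_token(
  app             = "MY_APP_NAME",        # placeholder app name
  consumer_key    = "MY_CONSUMER_KEY",    # placeholder keys and tokens
  consumer_secret = "MY_CONSUMER_SECRET", # from developer.twitter.com
  access_token    = "MY_ACCESS_TOKEN",
  access_secret   = "MY_ACCESS_SECRET"
)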

Scraping the timeline and retweets of the HONY Twitter handle

t_hony <- get_timeline("humansofny", n = 3200, retweets = TRUE) # pull up to 3,200 of the most recent tweets, including retweets

Printing Tweets

print(t_hony)
# A tibble: 3,199 x 90
   user_id   status_id    created_at          screen_name text  source
   <chr>     <chr>        <dttm>              <chr>       <chr> <chr> 
 1 237548529 15185585768~ 2022-04-25 11:52:32 humansofny  "Our~ Twitt~
 2 237548529 15158700641~ 2022-04-18 01:49:21 humansofny  "@TE~ Twitt~
 3 237548529 15085431545~ 2022-03-28 20:34:50 humansofny  "(4/~ Twitt~
 4 237548529 15085216216~ 2022-03-28 19:09:16 humansofny  "(3/~ Twitt~
 5 237548529 15085038151~ 2022-03-28 17:58:30 humansofny  "(2/~ Twitt~
 6 237548529 15084785526~ 2022-03-28 16:18:07 humansofny  "(1/~ Twitt~
 7 237548529 15026978255~ 2022-03-12 17:27:35 humansofny  "@mk~ Twitt~
 8 237548529 15026619784~ 2022-03-12 15:05:08 humansofny  "@cr~ Twitt~
 9 237548529 14995731454~ 2022-03-04 02:31:13 humansofny  "(13~ Twitt~
10 237548529 14995592910~ 2022-03-04 01:36:10 humansofny  "(12~ Twitt~
# ... with 3,189 more rows, and 84 more variables:
#   display_text_width <dbl>, reply_to_status_id <chr>,
#   reply_to_user_id <chr>, reply_to_screen_name <chr>,
#   is_quote <lgl>, is_retweet <lgl>, favorite_count <int>,
#   retweet_count <int>, quote_count <int>, reply_count <int>,
#   hashtags <list>, symbols <list>, urls_url <list>,
#   urls_t.co <list>, urls_expanded_url <list>, media_url <list>, ...

Preprocessing & Tokenization

# Restructure the tweets into a one-token-per-row format
tidy_tweets <- t_hony %>% # pipe the data frame
  filter(is_retweet == FALSE) %>% # keep only original tweets, dropping retweets
  select(status_id,
         text) %>% # select variables of interest
  unnest_tokens(word, text) # split the text column into one token per row
tidy_tweets
# A tibble: 3,195 x 2
   status_id           word        
   <chr>               <chr>       
 1 1518558576847077378 our         
 2 1518558576847077378 radio       
 3 1518558576847077378 podcast     
 4 1518558576847077378 host        
 5 1518558576847077378 chionwolf   
 6 1518558576847077378 will        
 7 1518558576847077378 be          
 8 1518558576847077378 sharing     
 9 1518558576847077378 a           
10 1518558576847077378 conversation
# ... with 3,185 more rows

Inspecting the default stop word list

stop_words
# A tibble: 1,149 x 2
   word        lexicon
   <chr>       <chr>  
 1 a           SMART  
 2 a's         SMART  
 3 able        SMART  
 4 about       SMART  
 5 above       SMART  
 6 according   SMART  
 7 accordingly SMART  
 8 across      SMART  
 9 actually    SMART  
10 after       SMART  
# ... with 1,139 more rows

Creating a custom stop word list

my_stop_words <- tibble( # construct a tibble of Twitter-specific stop words
  word = c(
    "https",
    "t.co",
    "rt",
    "amp",
    "rstats",
    "gt"
  ),
  lexicon = "twitter"
)

Combining the stop word lists

# Combine stop words
all_stop_words <- stop_words %>%
  bind_rows(my_stop_words) # append the custom Twitter list to the default lexicons

# Let's see if it worked
view(all_stop_words)

# Remove numbers: as.numeric() returns NA for any non-numeric token
tidy_tweets <- tidy_tweets %>%
    filter(is.na(as.numeric(word))) # filter() keeps rows where the condition is TRUE, i.e. non-numeric tokens
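
One caveat: as.numeric() throws a coercion warning for every non-numeric token. A roughly equivalent, warning-free alternative (a sketch using stringr's str_detect(), not in the original post) drops tokens made up entirely of digits:

# Alternative: drop purely numeric tokens with a regular expression (no warnings)
tidy_tweets <- tidy_tweets %>%
  filter(!str_detect(word, "^[0-9]+$"))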

Converting tokens to a vector

tidytweetsText <- as.vector(tidy_tweets$word) # note: unnest_tokens() replaced the text column with word
head(tidy_tweets, 5)
# A tibble: 5 x 2
  status_id           word     
  <chr>               <chr>    
1 1518558576847077378 our      
2 1518558576847077378 radio    
3 1518558576847077378 podcast  
4 1518558576847077378 host     
5 1518558576847077378 chionwolf

Removing stopwords

tweets_final <- tidy_tweets %>%
  anti_join(all_stop_words, by = "word") # drop every token that appears in the combined stop word list
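
As a quick sanity check (not in the original post), the most frequent remaining tokens can be inspected; after cleaning, the top of the list should be content words rather than URL fragments or function words:

# Most frequent tokens after stop word removal
tweets_final %>%
  count(word, sort = TRUE) %>%
  head(10)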

Sentiment Analysis of tweets using NRC

nrc <- get_sentiments("nrc") %>% # get the NRC sentiment lexicon in tidy format
    filter(sentiment %in% c("positive", "negative", "anger", "sadness", "trust", "fear", "disgust", "joy", "surprise")) # keep all NRC categories except anticipation

view(nrc)

nrc_words <- tweets_final %>%
  inner_join(nrc, by = "word") # keep only tokens that appear in the NRC lexicon

view(nrc_words)
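
Before aggregating, it can be useful to see which words drive each emotion category; a short sketch (not in the original post) listing the five most frequent words per NRC sentiment:

# Top 5 most frequent words per NRC sentiment category
nrc_words %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 5) %>%
  ungroup()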




pie_words<- nrc_words %>%
  group_by(sentiment) %>% # group by sentiment type
  tally %>% # counts number of rows
  arrange(desc(n)) # arrange sentiments in descending order based on frequency



ggpubr::ggpie(pie_words, "n", label = "sentiment", 
      fill = "sentiment", color = "white", 
      palette = "Spectral")
# NRC: compute net sentiment (positive categories minus negative categories)
pie_words %>% group_by(sentiment) %>%
    summarize(total = sum(n)) %>%
    spread(sentiment, total) %>%
    mutate(net.sentiment = (positive + joy + surprise + trust) -
                           (negative + anger + disgust + fear + sadness)) %>%
    kable(align = 'r')
 anger  disgust  fear  joy  negative  positive  sadness  surprise  trust  net.sentiment
    23       10    32   80        43       150       27        33     84            212

Sentiment Analysis of Tweets using Bing

bing <- get_sentiments("bing") %>%
  count(word, sentiment, sort = TRUE) # each lexicon entry appears once, so n = 1 for every word
bing
# A tibble: 6,786 x 3
   word        sentiment     n
   <chr>       <chr>     <int>
 1 2-faces     negative      1
 2 abnormal    negative      1
 3 abolish     negative      1
 4 abominable  negative      1
 5 abominably  negative      1
 6 abominate   negative      1
 7 abomination negative      1
 8 abort       negative      1
 9 aborted     negative      1
10 aborts      negative      1
# ... with 6,776 more rows
bing_words <- tweets_final %>%
  inner_join(bing, by = "word") # keep only tokens that appear in the Bing lexicon

view(bing_words)


pie_words<- bing_words %>%
  group_by(sentiment) %>% # group by sentiment type
  tally %>% # counts number of rows
  arrange(desc(n)) # arrange sentiments in descending order based on frequency



ggpubr::ggpie(pie_words, "n", label = "sentiment", 
      fill = "sentiment", color = "white", 
      palette = "Spectral")
# Bing: net sentiment is simply positive minus negative counts
bing_words %>% group_by(sentiment) %>%
    summarize(total = sum(n)) %>%
    spread(sentiment, total) %>%
    mutate(net.sentiment = positive - negative) %>%
    kable(align = 'l')
negative  positive  net.sentiment
37        95        58

Sentiment Analysis of tweets using AFINN

afinn_df <- tweets_final %>% inner_join(get_sentiments("afinn"), by = "word") %>%
    mutate(sentiment = case_when(value < 0 ~ 'negative', # AFINN scores run from -5 to +5
                                 value > 0 ~ 'positive'))

view(afinn_df)


pie_words <- afinn_df %>%
  group_by(sentiment) %>% # group by sentiment type
  tally %>% # counts number of rows
  arrange(desc(n)) # arrange sentiments in descending order based on frequency



ggpubr::ggpie(pie_words, "n", label = "sentiment", 
      fill = "sentiment", color = "white", 
      palette = "Spectral")
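
Unlike NRC and Bing, AFINN assigns each word an integer score from -5 to +5, so a net sentiment score can be computed by summing the values directly rather than counting categories; a sketch (not in the original post) for comparability with the tables above:

# Net AFINN sentiment: sum of signed word scores
afinn_df %>%
  summarize(net.sentiment = sum(value)) %>%
  kable(align = 'r')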

Observations:

All three dictionary methods (NRC, Bing, and AFINN) show that positive sentiment dominates negative sentiment in HONY's tweets. This suggests that Humans of New York has a positive influence on its readers through the stories it brings to its audience.