Scraping Twitter data using the Twitter API and performing sentiment analysis

Blog post 5, describing the use of the Twitter API to scrape data and perform sentiment analysis with dictionary methods, as part of the course “Text as Data”

Rahul Gundeti (Graduate student, Data Analytics & Computational Social Sciences (DACSS), UMass Amherst)
2022-05-03

Loading required libraries
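
The chunk itself is not shown; a minimal sketch of the packages this walkthrough relies on, assuming rtweet for the API calls, tidytext for tokenization and sentiment lexicons, the tidyverse for wrangling, ggpubr for the pie charts, and knitr for the tables:

library(rtweet)    # Twitter API access: get_timeline()
library(tidyverse) # dplyr, tidyr, stringr, ggplot2 for wrangling and plotting
library(tidytext)  # unnest_tokens(), stop_words, get_sentiments()
library(ggpubr)    # ggpie() for the sentiment pie charts
library(knitr)     # kable() for formatted tables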

Accessing Twitter API tokens
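
The keys themselves are omitted; a minimal sketch of the authentication step, assuming rtweet's create_token() with placeholder credentials from a Twitter developer app (all values below are hypothetical):

token <- create_token(
  app             = "MY_APP_NAME",        # placeholder app name
  consumer_key    = "MY_CONSUMER_KEY",    # placeholder keys and tokens
  consumer_secret = "MY_CONSUMER_SECRET", # from developer.twitter.com
  access_token    = "MY_ACCESS_TOKEN",
  access_secret   = "MY_ACCESS_SECRET"
)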

Scraping the timeline and retweets of the HONY Twitter handle

t_hony <- get_timeline("humansofny", n = 3200, retweets = TRUE) # pull up to 3,200 of the most recent tweets, including retweets

Printing Tweets

print(t_hony)
# A tibble: 3,199 x 90
   user_id   status_id    created_at          screen_name text  source
   <chr>     <chr>        <dttm>              <chr>       <chr> <chr> 
 1 237548529 15185585768~ 2022-04-25 11:52:32 humansofny  "Our~ Twitt~
 2 237548529 15158700641~ 2022-04-18 01:49:21 humansofny  "@TE~ Twitt~
 3 237548529 15085431545~ 2022-03-28 20:34:50 humansofny  "(4/~ Twitt~
 4 237548529 15085216216~ 2022-03-28 19:09:16 humansofny  "(3/~ Twitt~
 5 237548529 15085038151~ 2022-03-28 17:58:30 humansofny  "(2/~ Twitt~
 6 237548529 15084785526~ 2022-03-28 16:18:07 humansofny  "(1/~ Twitt~
 7 237548529 15026978255~ 2022-03-12 17:27:35 humansofny  "@mk~ Twitt~
 8 237548529 15026619784~ 2022-03-12 15:05:08 humansofny  "@cr~ Twitt~
 9 237548529 14995731454~ 2022-03-04 02:31:13 humansofny  "(13~ Twitt~
10 237548529 14995592910~ 2022-03-04 01:36:10 humansofny  "(12~ Twitt~
# ... with 3,189 more rows, and 84 more variables:
#   display_text_width <dbl>, reply_to_status_id <chr>,
#   reply_to_user_id <chr>, reply_to_screen_name <chr>,
#   is_quote <lgl>, is_retweet <lgl>, favorite_count <int>,
#   retweet_count <int>, quote_count <int>, reply_count <int>,
#   hashtags <list>, symbols <list>, urls_url <list>,
#   urls_t.co <list>, urls_expanded_url <list>, media_url <list>, ...

Preprocessing & Tokenization

# Restructure the tweets into a one-token-per-row format
tidy_tweets <- t_hony %>% # pipe the data frame
  filter(is_retweet == FALSE) %>% # keep only original tweets, dropping retweets
  select(status_id,
         text) %>% # select variables of interest
  unnest_tokens(word, text) # split the text column into one token per row
tidy_tweets
# A tibble: 3,195 x 2
   status_id           word        
   <chr>               <chr>       
 1 1518558576847077378 our         
 2 1518558576847077378 radio       
 3 1518558576847077378 podcast     
 4 1518558576847077378 host        
 5 1518558576847077378 chionwolf   
 6 1518558576847077378 will        
 7 1518558576847077378 be          
 8 1518558576847077378 sharing     
 9 1518558576847077378 a           
10 1518558576847077378 conversation
# ... with 3,185 more rows

Inspecting the default stop word list

stop_words
# A tibble: 1,149 x 2
   word        lexicon
   <chr>       <chr>  
 1 a           SMART  
 2 a's         SMART  
 3 able        SMART  
 4 about       SMART  
 5 above       SMART  
 6 according   SMART  
 7 accordingly SMART  
 8 across      SMART  
 9 actually    SMART  
10 after       SMART  
# ... with 1,139 more rows

Creating a custom stop word list

my_stop_words <- tibble( # construct a tibble of Twitter-specific stop words
  word = c(
    "https",
    "t.co",
    "rt",
    "amp",
    "rstats",
    "gt"
  ),
  lexicon = "twitter"
)

Combining the stop word lists

# Combine stop words
all_stop_words <- stop_words %>%
  bind_rows(my_stop_words) # append the custom Twitter list to the default lexicons

# Let's see if it worked
view(all_stop_words)

# Remove numbers: as.numeric() returns NA for any non-numeric token
tidy_tweets <- tidy_tweets %>%
    filter(is.na(as.numeric(word))) # filter() keeps rows where the condition is TRUE, i.e. non-numeric tokens
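
One caveat: as.numeric() throws a coercion warning for every non-numeric token. A roughly equivalent, warning-free alternative (a sketch using stringr's str_detect(), not in the original post) drops tokens made up entirely of digits:

# Alternative: drop purely numeric tokens with a regular expression (no warnings)
tidy_tweets <- tidy_tweets %>%
  filter(!str_detect(word, "^[0-9]+$"))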

Converting tokens to a vector

tidytweetsText <- as.vector(tidy_tweets$word) # note: unnest_tokens() replaced the text column with word
head(tidy_tweets, 5)
# A tibble: 5 x 2
  status_id           word     
  <chr>               <chr>    
1 1518558576847077378 our      
2 1518558576847077378 radio    
3 1518558576847077378 podcast  
4 1518558576847077378 host     
5 1518558576847077378 chionwolf

Removing stopwords

tweets_final <- tidy_tweets %>%
  anti_join(all_stop_words, by = "word") # drop every token that appears in the combined stop word list
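
As a quick sanity check (not in the original post), the most frequent remaining tokens can be inspected; after cleaning, the top of the list should be content words rather than URL fragments or function words:

# Most frequent tokens after stop word removal
tweets_final %>%
  count(word, sort = TRUE) %>%
  head(10)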

Sentiment Analysis of tweets using NRC

nrc <- get_sentiments("nrc") %>% # get the NRC sentiment lexicon in tidy format
    filter(sentiment %in% c("positive", "negative", "anger", "sadness", "trust", "fear", "disgust", "joy", "surprise")) # keep all NRC categories except anticipation

view(nrc)

nrc_words <- tweets_final %>%
  inner_join(nrc, by = "word") # keep only tokens that appear in the NRC lexicon

view(nrc_words)
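
Before aggregating, it can be useful to see which words drive each emotion category; a short sketch (not in the original post) listing the five most frequent words per NRC sentiment:

# Top 5 most frequent words per NRC sentiment category
nrc_words %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 5) %>%
  ungroup()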




pie_words<- nrc_words %>%
  group_by(sentiment) %>% # group by sentiment type
  tally %>% # counts number of rows
  arrange(desc(n)) # arrange sentiments in descending order based on frequency



ggpubr::ggpie(pie_words, "n", label = "sentiment", 
      fill = "sentiment", color = "white", 
      palette = "Spectral")
# NRC: compute net sentiment (positive categories minus negative categories)
pie_words %>% group_by(sentiment) %>%
    summarize(total = sum(n)) %>%
    spread(sentiment, total) %>%
    mutate(net.sentiment = (positive + joy + surprise + trust) -
                           (negative + anger + disgust + fear + sadness)) %>%
    kable(align = 'r')
 anger  disgust  fear  joy  negative  positive  sadness  surprise  trust  net.sentiment
    23       10    32   80        43       150       27        33     84            212

Sentiment Analysis of Tweets using Bing

bing <- get_sentiments("bing") %>%
  count(word, sentiment, sort = TRUE) # each lexicon entry appears once, so n = 1 for every word
bing
# A tibble: 6,786 x 3
   word        sentiment     n
   <chr>       <chr>     <int>
 1 2-faces     negative      1
 2 abnormal    negative      1
 3 abolish     negative      1
 4 abominable  negative      1
 5 abominably  negative      1
 6 abominate   negative      1
 7 abomination negative      1
 8 abort       negative      1
 9 aborted     negative      1
10 aborts      negative      1
# ... with 6,776 more rows
bing_words <- tweets_final %>%
  inner_join(bing, by = "word") # keep only tokens that appear in the Bing lexicon

view(bing_words)


pie_words<- bing_words %>%
  group_by(sentiment) %>% # group by sentiment type
  tally %>% # counts number of rows
  arrange(desc(n)) # arrange sentiments in descending order based on frequency



ggpubr::ggpie(pie_words, "n", label = "sentiment", 
      fill = "sentiment", color = "white", 
      palette = "Spectral")
# Bing: net sentiment is simply positive minus negative counts
bing_words %>% group_by(sentiment) %>%
    summarize(total = sum(n)) %>%
    spread(sentiment, total) %>%
    mutate(net.sentiment = positive - negative) %>%
    kable(align = 'l')
negative  positive  net.sentiment
37        95        58

Sentiment Analysis of tweets using AFINN

afinn_df <- tweets_final %>% inner_join(get_sentiments("afinn"), by = "word") %>%
    mutate(sentiment = case_when(value < 0 ~ 'negative', # AFINN scores run from -5 to +5
                                 value > 0 ~ 'positive'))

view(afinn_df)


pie_words <- afinn_df %>%
  group_by(sentiment) %>% # group by sentiment type
  tally %>% # counts number of rows
  arrange(desc(n)) # arrange sentiments in descending order based on frequency



ggpubr::ggpie(pie_words, "n", label = "sentiment", 
      fill = "sentiment", color = "white", 
      palette = "Spectral")
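
Unlike NRC and Bing, AFINN assigns each word an integer score from -5 to +5, so a net sentiment score can be computed by summing the values directly rather than counting categories; a sketch (not in the original post) for comparability with the tables above:

# Net AFINN sentiment: sum of signed word scores
afinn_df %>%
  summarize(net.sentiment = sum(value)) %>%
  kable(align = 'r')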

Observations:

All three dictionary methods (NRC, Bing, and AFINN) show that positive sentiment dominates negative sentiment in HONY's tweets. This suggests that Humans of New York has a positive influence on its readers through the stories it brings to its audience.