Blog post 5, describing how to scrape data with the Twitter API and perform sentiment analysis using dictionary methods, as part of the course “Text as Data”.
Loading required libraries
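The library chunk itself is not shown, so here is a minimal sketch of the packages everything below relies on (the exact set is my assumption, based on the functions used in this post):
# Packages used throughout this post
library(rtweet)     # Twitter API client: get_timeline()
library(tidyverse)  # dplyr, tidyr, ggplot2 for wrangling and plotting
library(tidytext)   # unnest_tokens(), stop_words, get_sentiments()
library(textdata)   # hosts the NRC and AFINN lexicons
library(ggpubr)     # ggpie() for the sentiment pie charts
library(knitr)      # kable() for the summary tables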
Accessing Twitter API tokens
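The token chunk is omitted to keep the credentials private; a typical setup with rtweet's create_token() (as in rtweet 0.7, which this post uses) looks like the sketch below, where every string is a placeholder for your own app's values:
# Authenticate with your own app credentials (all strings are placeholders)
token <- create_token(
  app             = "my_twitter_app",
  consumer_key    = "YOUR_CONSUMER_KEY",
  consumer_secret = "YOUR_CONSUMER_SECRET",
  access_token    = "YOUR_ACCESS_TOKEN",
  access_secret   = "YOUR_ACCESS_SECRET"
)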
Scraping Timeline and retweets of HONY twitter handle
t_hony <- get_timeline("humansofny", n = 3200, retweets = TRUE)
Printing Tweets
print(t_hony)
# A tibble: 3,199 x 90
user_id status_id created_at screen_name text source
<chr> <chr> <dttm> <chr> <chr> <chr>
1 237548529 15185585768~ 2022-04-25 11:52:32 humansofny "Our~ Twitt~
2 237548529 15158700641~ 2022-04-18 01:49:21 humansofny "@TE~ Twitt~
3 237548529 15085431545~ 2022-03-28 20:34:50 humansofny "(4/~ Twitt~
4 237548529 15085216216~ 2022-03-28 19:09:16 humansofny "(3/~ Twitt~
5 237548529 15085038151~ 2022-03-28 17:58:30 humansofny "(2/~ Twitt~
6 237548529 15084785526~ 2022-03-28 16:18:07 humansofny "(1/~ Twitt~
7 237548529 15026978255~ 2022-03-12 17:27:35 humansofny "@mk~ Twitt~
8 237548529 15026619784~ 2022-03-12 15:05:08 humansofny "@cr~ Twitt~
9 237548529 14995731454~ 2022-03-04 02:31:13 humansofny "(13~ Twitt~
10 237548529 14995592910~ 2022-03-04 01:36:10 humansofny "(12~ Twitt~
# ... with 3,189 more rows, and 84 more variables:
# display_text_width <dbl>, reply_to_status_id <chr>,
# reply_to_user_id <chr>, reply_to_screen_name <chr>,
# is_quote <lgl>, is_retweet <lgl>, favorite_count <int>,
# retweet_count <int>, quote_count <int>, reply_count <int>,
# hashtags <list>, symbols <list>, urls_url <list>,
# urls_t.co <list>, urls_expanded_url <list>, media_url <list>, ...
Preprocessing & Tokenization
# We need to restructure the tweets into a one-token-per-row format
tidy_tweets <- t_hony %>% # pipe the data frame
  filter(is_retweet == FALSE) %>% # only include original tweets
  select(status_id,
         text) %>% # select variables of interest
  unnest_tokens(word, text) # split the text column into one token per row
tidy_tweets
# A tibble: 3,195 x 2
status_id word
<chr> <chr>
1 1518558576847077378 our
2 1518558576847077378 radio
3 1518558576847077378 podcast
4 1518558576847077378 host
5 1518558576847077378 chionwolf
6 1518558576847077378 will
7 1518558576847077378 be
8 1518558576847077378 sharing
9 1518558576847077378 a
10 1518558576847077378 conversation
# ... with 3,185 more rows
Loading stop words
stop_words
# A tibble: 1,149 x 2
word lexicon
<chr> <chr>
1 a SMART
2 a's SMART
3 able SMART
4 about SMART
5 above SMART
6 according SMART
7 accordingly SMART
8 across SMART
9 actually SMART
10 after SMART
# ... with 1,139 more rows
Creating a custom stop word data frame
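The chunk that defines my_stop_words did not make it into the post; a minimal sketch is below, where the specific words (common Twitter artifacts) are an assumption rather than the original list:
# Custom stop words: assumed list of common Twitter artifacts
my_stop_words <- tibble(
  word = c("https", "t.co", "rt", "amp"),
  lexicon = "custom"
)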
Combining stop words into one data frame
# Combine the stop word lists
all_stop_words <- stop_words %>%
  bind_rows(my_stop_words) # here we are stacking the two data frames
# Let's see if it worked
view(all_stop_words)
# Remove numbers
tidy_tweets <- tidy_tweets %>%
  filter(is.na(as.numeric(word))) # non-numeric tokens coerce to NA (with a warning), and filter() keeps rows where the condition is true
Previewing the cleaned tokens
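The code for this preview is missing; given the five-row tibble printed below, it was likely just a quick look at the cleaned tokens, e.g. (my assumption):
head(tidy_tweets, 5) # peek at the first five token rows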
# A tibble: 5 x 2
status_id word
<chr> <chr>
1 1518558576847077378 our
2 1518558576847077378 radio
3 1518558576847077378 podcast
4 1518558576847077378 host
5 1518558576847077378 chionwolf
Removing stopwords
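The removal step itself is not shown, but the tweets_final object used in the next section implies an anti-join against the combined stop word list, roughly:
# Keep only tokens that do NOT appear in the combined stop word list (sketch)
tweets_final <- tidy_tweets %>%
  anti_join(all_stop_words, by = "word")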
Sentiment Analysis of Tweets using NRC
nrc <- get_sentiments("nrc") %>% # get a specific sentiment lexicon in a tidy format
  filter(sentiment %in% c("positive", "negative", "anger", "sadness", "trust",
                          "fear", "disgust", "joy", "surprise")) # every NRC category except anticipation
view(nrc)
nrc_words <- tweets_final %>%
inner_join(nrc, by="word")
view(nrc_words)
pie_words <- nrc_words %>%
  group_by(sentiment) %>% # group by sentiment type
  tally() %>% # count the number of rows per sentiment
  arrange(desc(n)) # arrange sentiments in descending order of frequency
ggpubr::ggpie(pie_words, "n", label = "sentiment",
fill = "sentiment", color = "white",
palette = "Spectral")
# NRC:
pie_words %>% group_by(sentiment) %>%
  summarize(total = sum(n)) %>%
  spread(sentiment, total) %>%
  mutate(net.sentiment = (positive + joy + surprise + trust) -
                         (negative + anger + disgust + fear + sadness)) %>%
  kable(align = 'r')
| anger | disgust | fear | joy | negative | positive | sadness | surprise | trust | net.sentiment |
|---|---|---|---|---|---|---|---|---|---|
| 23 | 10 | 32 | 80 | 43 | 150 | 27 | 33 | 84 | 212 |
Sentiment Analysis of Tweets using Bing
bing <- get_sentiments("bing") %>%
  count(word, sentiment, sort = TRUE) # each word appears once in the lexicon, so n is 1 throughout
bing
# A tibble: 6,786 x 3
word sentiment n
<chr> <chr> <int>
1 2-faces negative 1
2 abnormal negative 1
3 abolish negative 1
4 abominable negative 1
5 abominably negative 1
6 abominate negative 1
7 abomination negative 1
8 abort negative 1
9 aborted negative 1
10 aborts negative 1
# ... with 6,776 more rows
bing_words <- tweets_final %>%
inner_join(bing, by="word")
view(bing_words)
pie_words <- bing_words %>%
  group_by(sentiment) %>% # group by sentiment type
  tally() %>% # count the number of rows per sentiment
  arrange(desc(n)) # arrange sentiments in descending order of frequency
ggpubr::ggpie(pie_words, "n", label = "sentiment",
fill = "sentiment", color = "white",
palette = "Spectral")
bing_words %>% group_by(sentiment) %>%
  summarize(total = sum(n)) %>%
  spread(sentiment, total) %>%
  mutate(net.sentiment = positive - negative) %>%
  kable(align = 'l')
| negative | positive | net.sentiment |
|---|---|---|
| 37 | 95 | 58 |
Sentiment Analysis of Tweets using AFINN
afinn_df <- tweets_final %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  mutate(sentiment = case_when(value < 0 ~ 'negative',
                               value > 0 ~ 'positive'))
view(afinn_df)
pie_words <- afinn_df %>%
  group_by(sentiment) %>% # group by sentiment type
  tally() %>% # count the number of rows per sentiment
  arrange(desc(n)) # arrange sentiments in descending order of frequency
ggpubr::ggpie(pie_words, "n", label = "sentiment",
fill = "sentiment", color = "white",
palette = "Spectral")
All three dictionary methods show positive sentiment dominating negative sentiment in the HONY tweets. HONY appears to have a positive influence on its Twitter readers through the stories it brings to its audience.