This is my 4th blog post for the course Text as Data. In this post I would like to cover the topic “Sentiment Analysis”. I am using tweets on a specific topic, namely tweets about the new movie “Kashmirfiles”. I extracted nearly 10k tweets using my Twitter API ID.
library(readr)
library(dplyr)
library(tidyr)
library(lubridate)
# Read the tweets from the CSV file and preview the first six rows.
twitter_data <- read_csv(file = "twitter_data.csv")
head(twitter_data, n = 6L)
# A tibble: 6 x 90
user_id status_id created_at screen_name text source
<chr> <chr> <dttm> <chr> <chr> <chr>
1 x77042467055~ x1507185~ 2022-03-25 02:38:26 Badh_Badh_~ "#Ka~ Twitt~
2 x77042467055~ x1507175~ 2022-03-25 01:59:56 Badh_Badh_~ "@Ad~ Twitt~
3 x77042467055~ x1507185~ 2022-03-25 02:38:21 Badh_Badh_~ "“I ~ Twitt~
4 x77042467055~ x1507181~ 2022-03-25 02:24:18 Badh_Badh_~ "I h~ Twitt~
5 x378010643 x1507185~ 2022-03-25 02:38:25 VishalMule~ "@Ar~ Twitt~
6 x13862304082~ x1507185~ 2022-03-25 02:38:23 AmitSha017~ "Del~ Twitt~
# ... with 84 more variables: display_text_width <dbl>,
# reply_to_status_id <chr>, reply_to_user_id <chr>,
# reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
# favorite_count <dbl>, retweet_count <dbl>, quote_count <lgl>,
# reply_count <lgl>, hashtags <chr>, symbols <lgl>, urls_url <chr>,
# urls_t.co <chr>, urls_expanded_url <chr>, media_url <chr>,
# media_t.co <chr>, media_expanded_url <chr>, media_type <chr>, ...
# List every column name available in the tweet data.
names(twitter_data)
[1] "user_id" "status_id"
[3] "created_at" "screen_name"
[5] "text" "source"
[7] "display_text_width" "reply_to_status_id"
[9] "reply_to_user_id" "reply_to_screen_name"
[11] "is_quote" "is_retweet"
[13] "favorite_count" "retweet_count"
[15] "quote_count" "reply_count"
[17] "hashtags" "symbols"
[19] "urls_url" "urls_t.co"
[21] "urls_expanded_url" "media_url"
[23] "media_t.co" "media_expanded_url"
[25] "media_type" "ext_media_url"
[27] "ext_media_t.co" "ext_media_expanded_url"
[29] "ext_media_type" "mentions_user_id"
[31] "mentions_screen_name" "lang"
[33] "quoted_status_id" "quoted_text"
[35] "quoted_created_at" "quoted_source"
[37] "quoted_favorite_count" "quoted_retweet_count"
[39] "quoted_user_id" "quoted_screen_name"
[41] "quoted_name" "quoted_followers_count"
[43] "quoted_friends_count" "quoted_statuses_count"
[45] "quoted_location" "quoted_description"
[47] "quoted_verified" "retweet_status_id"
[49] "retweet_text" "retweet_created_at"
[51] "retweet_source" "retweet_favorite_count"
[53] "retweet_retweet_count" "retweet_user_id"
[55] "retweet_screen_name" "retweet_name"
[57] "retweet_followers_count" "retweet_friends_count"
[59] "retweet_statuses_count" "retweet_location"
[61] "retweet_description" "retweet_verified"
[63] "place_url" "place_name"
[65] "place_full_name" "place_type"
[67] "country" "country_code"
[69] "geo_coords" "coords_coords"
[71] "bbox_coords" "status_url"
[73] "name" "location"
[75] "description" "url"
[77] "protected" "followers_count"
[79] "friends_count" "listed_count"
[81] "statuses_count" "favourites_count"
[83] "account_created_at" "verified"
[85] "profile_url" "profile_expanded_url"
[87] "account_lang" "profile_banner_url"
[89] "profile_background_url" "profile_image_url"
The data consists of 90 columns. Out of these 90 columns, we focus on the main ones: text, source, and created_at. These three columns hold the tweet itself, the source of the tweet, and the time of the tweet.
# A tibble: 15 x 2
source n
<chr> <int>
1 Twitter for Android 5849
2 Twitter for iPhone 1323
3 Twitter Web App 1171
4 Twitter for iPad 36
5 TweetDeck 5
6 KOSHUR_BOT 3
7 TweetCaster for Android 3
8 All India Tweet 2
9 tweeper 2
10 Buffer 1
11 Fenix 2 1
12 fff-bot 1
13 IFTTT 1
14 LinkedIn 1
15 RT BOT Adyaveer 1
Most of the tweets come from Android, followed by iPhone and the Web App. As the majority of the tweets are from Android and iPhone, we now focus on those tweets.
# Preview the reduced tweet table (user_id, source, text, created_at).
# NOTE(review): cleaned_tweets is not defined anywhere in the visible part of
# this file; it must be created before this line runs.
head(cleaned_tweets, n = 6L)
# A tibble: 6 x 4
user_id source text created_at
<chr> <chr> <chr> <dttm>
1 x378010643 Android "@ArvindKejriwal’s~ 2022-03-25 02:38:25
2 x1386230408228970497 Android "Delhi CM @ArvindK~ 2022-03-25 02:38:23
3 x1386230408228970497 Android "Truth about #Kash~ 2022-03-24 18:25:35
4 x1386230408228970497 Android "#KashmirFiles is ~ 2022-03-24 18:36:12
5 x1457558317790756872 iPhone "“Every Hindu chil~ 2022-03-25 02:38:17
6 x51540579 Android "“Every Hindu chil~ 2022-03-25 02:38:05
# Load the packages
library(ggplot2)
# Plot the percentage of tweets by hour of the day for each device.
# count() returns an ungrouped tibble, so the shares are computed within each
# source explicitly; without group_by() every value would be a share of ALL
# tweets and the two lines would not be comparable per device.
cleaned_tweets %>%
  count(source, hour = hour(with_tz(created_at, "EST"))) %>%
  group_by(source) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(hour, percent, color = source)) +
  geom_line() +
  scale_y_continuous(labels = scales::label_percent()) +
  labs(x = "Hour of day (EST)",
       y = "% of tweets",
       color = "")
library(stringr)
# Count tweets per device, split by whether the text contains a t.co link.
tweet_picture_counts <- cleaned_tweets %>%
  # Drop tweets whose text starts with a straight double quote.
  # NOTE(review): tweets opening with a curly quote (“) are not matched by this
  # pattern -- confirm whether they should be excluded as well.
  filter(!str_detect(text, '^"')) %>%
  count(source,
        # The dot is escaped so only the literal "t.co" matches; an unescaped
        # "." matches any character, so text like "must come" counted as a link.
        picture = ifelse(str_detect(text, "t\\.co"),
                         "Picture/link", "No picture/link"))

# Side-by-side bars: number of tweets with and without a link for each device.
ggplot(tweet_picture_counts, aes(source, n, fill = picture)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "", y = "Number of tweets", fill = "")
library(tidytext)
# Token delimiter: any character that is not a letter, digit, "#", "@" or an
# apostrophe, plus apostrophes that are not followed by such a character.
# This keeps hashtags, @mentions and contractions intact as single tokens.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

# One row per word per tweet, with links, HTML ampersands and stop words removed.
# NOTE(review): tokens such as "094b" or "0001f621" in the later output look
# like residue of <U+XXXX> escapes in the text -- confirm the file encoding.
tweet_words <- cleaned_tweets %>%
  # Drop tweets whose text starts with a straight double quote.
  filter(!str_detect(text, '^"')) %>%
  # Remove t.co links and the HTML entity "&amp;". Stripping only "&" would
  # leave a stray "amp" token behind; the dot in "t.co" is escaped so it is
  # matched literally.
  mutate(text = str_replace_all(text, "https://t\\.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  # Keep only non-stop-words that contain at least one letter.
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))

head(tweet_words)
# A tibble: 6 x 4
user_id source created_at word
<chr> <chr> <dttm> <chr>
1 x378010643 Android 2022-03-25 02:38:25 @arvindkejriwal
2 x378010643 Android 2022-03-25 02:38:25 mins
3 x378010643 Android 2022-03-25 02:38:25 fame
4 x378010643 Android 2022-03-25 02:38:25 @vivekagnihotri
5 x378010643 Android 2022-03-25 02:38:25 #kashmirfiles
6 x1386230408228970497 Android 2022-03-25 02:38:23 delhi
# Horizontal bar chart of the 20 most frequent words across all tweets.
tweet_words %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  # Order the factor levels by frequency so the bars appear sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(y = "Occurrences")
# For every word used at least 5 times in total, compute its smoothed share of
# each device's words and the log2 ratio of the Android share to the iPhone
# share (positive = relatively more common on Android).
android_iphone_ratios <- tweet_words %>%
  count(word, source) %>%
  group_by(word) %>%
  filter(sum(n) >= 5) %>%
  # Ungroup before reshaping so the column-wise sums below span all words.
  ungroup() %>%
  # pivot_wider() replaces the superseded spread(); names_sort keeps the
  # device columns in alphabetical order, as spread() did.
  pivot_wider(names_from = source, values_from = n,
              values_fill = 0, names_sort = TRUE) %>%
  # Add-one smoothing avoids division by zero for words seen on one device only.
  mutate(across(where(is.numeric), ~ (.x + 1) / sum(.x + 1))) %>%
  mutate(logratio = log2(Android / iPhone)) %>%
  arrange(desc(logratio))

head(android_iphone_ratios)
# A tibble: 6 x 4
word Android iPhone logratio
<chr> <dbl> <dbl> <dbl>
1 iiojk 0.000511 0.0000456 3.49
2 094b 0.000305 0.0000456 2.74
3 0001f621 0.000272 0.0000456 2.57
4 094d 0.000533 0.0000913 2.55
5 leave 0.000261 0.0000456 2.52
6 divide 0.000239 0.0000456 2.39
There are a variety of methods and dictionaries for evaluating the opinion or emotion in text. A few of them are:
1. AFINN
2. Bing
3. NRC
The function get_sentiments() allows us to get specific sentiment lexicons with the appropriate measures for each one.
# AFINN lexicon: one numeric sentiment value per word.
get_sentiments(lexicon = "afinn")
# A tibble: 2,477 x 2
word value
<chr> <dbl>
1 abandon -2
2 abandoned -2
3 abandons -2
4 abducted -2
5 abduction -2
6 abductions -2
7 abhor -3
8 abhorred -3
9 abhorrent -3
10 abhors -3
# ... with 2,467 more rows
# Bing lexicon: one sentiment label per word.
get_sentiments(lexicon = "bing")
# A tibble: 6,786 x 2
word sentiment
<chr> <chr>
1 2-faces negative
2 abnormal negative
3 abolish negative
4 abominable negative
5 abominably negative
6 abominate negative
7 abomination negative
8 abort negative
9 aborted negative
10 aborts negative
# ... with 6,776 more rows
# NRC lexicon: a word may appear on several rows, one per associated sentiment.
get_sentiments(lexicon = "nrc")
# A tibble: 13,875 x 2
word sentiment
<chr> <chr>
1 abacus trust
2 abandon fear
3 abandon negative
4 abandon sadness
5 abandoned anger
6 abandoned fear
7 abandoned negative
8 abandoned sadness
9 abandonment anger
10 abandonment fear
# ... with 13,865 more rows
Bing consists of two types of sentiment (positive and negative), AFINN gives a numeric sentiment value for each word, and NRC gives the emotion associated with a word, such as trust, fear, negative, positive, or anger.
# NRC lexicon used to tag each word with its associated sentiments.
nrc <- get_sentiments(lexicon = "nrc")

# Attach NRC sentiments to the per-word device ratios, keep everything except
# the "positive"/"negative" labels, and retain for each sentiment the 10 words
# with the largest absolute log ratio.
android_iphone_sentiment <-
  inner_join(android_iphone_ratios, nrc, by = "word") %>%
  filter(!(sentiment %in% c("positive", "negative"))) %>%
  # Reorder both factors by -logratio so the plot can follow that ordering.
  mutate(sentiment = reorder(sentiment, -logratio)) %>%
  mutate(word = reorder(word, -logratio)) %>%
  group_by(sentiment) %>%
  top_n(n = 10, wt = abs(logratio)) %>%
  ungroup()

head(android_iphone_sentiment, n = 6L)
# A tibble: 6 x 5
word Android iPhone logratio sentiment
<fct> <dbl> <dbl> <dbl> <fct>
1 leave 0.000261 0.0000456 2.52 sadness
2 leave 0.000261 0.0000456 2.52 surprise
3 lines 0.000218 0.0000456 2.25 fear
4 plan 0.000207 0.0000456 2.18 anticipation
5 rating 0.000207 0.0000456 2.18 anger
6 rating 0.000207 0.0000456 2.18 fear
# One panel per sentiment: bars show each word's Android / iPhone log ratio,
# colored by which side of zero the ratio falls on.
ggplot(
  data = android_iphone_sentiment,
  mapping = aes(x = word, y = logratio, fill = logratio < 0)
) +
  geom_bar(stat = "identity") +
  facet_wrap(facets = ~ sentiment, nrow = 2, scales = "free") +
  # FALSE (ratio >= 0) is labeled Android, TRUE (ratio < 0) is labeled iPhone.
  scale_fill_manual(name = "", labels = c("Android", "iPhone"),
                    values = c("red", "lightblue")) +
  labs(x = "", y = "Android / iPhone log ratio") +
  # Rotate the word labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))