packages <- c(
  'rtweet',
  'httpuv',
  'tidyverse',
  'tidytext',
  'ggwordcloud',
  'reshape2',
  'wordcloud',
  'igraph',
  'ggraph',
  'topicmodels',
  'tm'
)
# install any packages that are missing, then load them all
package.check <- lapply(
  packages,
  FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x, character.only = TRUE)
    }
  }
)
## Loading required package: rtweet
## Warning: package 'rtweet' was built under R version 4.1.3
## Loading required package: httpuv
## Warning: package 'httpuv' was built under R version 4.1.3
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks rtweet::flatten()
## x dplyr::lag() masks stats::lag()
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.1.3
## Loading required package: ggwordcloud
## Warning: package 'ggwordcloud' was built under R version 4.1.3
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 4.1.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## Loading required package: wordcloud
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 4.1.3
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## Loading required package: ggraph
## Warning: package 'ggraph' was built under R version 4.1.3
## Loading required package: topicmodels
## Warning: package 'topicmodels' was built under R version 4.1.3
## Loading required package: tm
## Warning: package 'tm' was built under R version 4.1.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library('rtweet')
library('httpuv')
# credentials redacted -- substitute your own keys from the Twitter developer portal
api_key <- "YOUR_API_KEY"
api_secret_key <- "YOUR_API_SECRET_KEY"
app_name <- "CenterScrape"
access_token <- "YOUR_ACCESS_TOKEN"
access_token_secret <- "YOUR_ACCESS_TOKEN_SECRET"
token <- create_token(
  app = app_name,
  consumer_key = api_key,
  consumer_secret = api_secret_key,
  access_token = access_token,
  access_secret = access_token_secret
)
get_token()
## <Token>
## <oauth_endpoint>
## request: https://api.twitter.com/oauth/request_token
## authorize: https://api.twitter.com/oauth/authenticate
## access: https://api.twitter.com/oauth/access_token
## <oauth_app> CenterScrape
## key: <hidden>
## secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
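Note that create_token() comes from the rtweet 0.7.x used to build this document; it is deprecated in rtweet >= 1.0. A sketch of the newer interface, assuming rtweet 1.x:
# rtweet >= 1.0 sketch (assumption: rtweet 1.x API; not run here)
# auth <- rtweet_bot(
#   api_key = api_key,
#   api_secret = api_secret_key,
#   access_token = access_token,
#   access_secret = access_token_secret
# )
# auth_as(auth)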
library('tidyverse')
library('rtweet')
timelineDF <- get_timelines("rajnathsingh")
removeURL <- function(x) gsub("http[[:alnum:][:punct:]]*", "", x)
tweet <- timelineDF %>%
  filter(is_retweet == FALSE) %>%
  select(text) %>%
  mutate(tweet_id = n():1) %>% # number tweets so the oldest kept tweet is 1, avoiding a hard-coded 50:1
  rename(tweet = text) %>%
  mutate(tweet = removeURL(tweet))
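As a quick sanity check, removeURL() strips a t.co-style link from a hypothetical string (not taken from the scraped data):
removeURL("Great meeting today https://t.co/abc123 #India")
## [1] "Great meeting today  #India"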
library('tidytext')
tokenized_tweets <- unnest_tokens(tweet, input = 'tweet', output = 'word')
head(tokenized_tweets)
## tweet_id word
## 1 50 visited
## 2 50 the
## 3 50 telecommunication
## 4 50 university
## 5 50 in
## 6 50 nha
tokenized_tweets %>%
  count(word, sort = TRUE) %>%
  rename(count = n) %>%
  filter(count > 5) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(x = count, y = word)) +
  geom_col() +
  labs(title = "Count of Words in rajnathsingh Tweets") +
  scale_x_continuous(breaks = seq(0, 50, 5))

tokenized_tweets %>%
  anti_join(stop_words) %>% # drop rows whose word appears in the predefined stop-word list
  count(word, sort = TRUE) %>%
  rename(count = n) %>%
  filter(count > 5) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(x = count, y = word)) +
  geom_col() +
  labs(title = "Count of Words in rajnathsingh Tweets (Stop Words Removed)") +
  scale_x_continuous(breaks = seq(0, 50, 5))
## Joining, by = "word"
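The bars above still surface Hindi function words because stop_words covers English only. A minimal sketch of extending the lexicon with a hand-picked (illustrative, far from exhaustive) Hindi list:
hindi_stops <- tibble(word = c("के", "की", "है", "में", "और"), lexicon = "custom")
tokenized_tweets %>%
  anti_join(bind_rows(stop_words, hindi_stops)) %>%
  count(word, sort = TRUE)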

library('ggwordcloud')
tokenized_tweets %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(n > 4) %>%
  ggplot(aes(label = word, size = n, color = n)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 15)
## Joining, by = "word"

get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,872 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,862 more rows
tokenized_tweets %>%
  group_by(tweet_id) %>%
  inner_join(get_sentiments("afinn")) %>%
  summarise(mean_sentiment = mean(value)) %>%
  ggplot(aes(x = tweet_id, y = mean_sentiment)) +
  geom_col() +
  labs(title = "Mean Sentiment by Tweet - Afinn Lexicon", x = "Tweet ID", y = "Mean Sentiment") +
  scale_x_continuous(breaks = seq(1, 50)) +
  scale_y_continuous(breaks = seq(-1, 3, 0.5))
## Joining, by = "word"
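The same per-tweet idea works with the binary bing lexicon; a sketch computing net sentiment as positive minus negative word counts (the positive/negative column names come from the lexicon itself):
tokenized_tweets %>%
  inner_join(get_sentiments("bing")) %>%
  count(tweet_id, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net_sentiment = positive - negative)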

library('reshape2')
library('wordcloud')
tokenized_tweets %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>% # cast into a matrix with negative and positive columns
  comparison.cloud(colors = c("red", "green"),
                   max.words = 20)
## Joining, by = "word"
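Beyond positive/negative, the nrc lexicon previewed earlier also tags emotion categories; a short sketch tallying them across all tweets:
tokenized_tweets %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment, sort = TRUE)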

tokenized_tweets %>%
  count(word, sort = TRUE) %>%
  rename(count = n) %>%
  mutate(total = sum(count)) %>%
  mutate(tf = count / total) %>%
  head()
## word count total tf
## 1 the 84 1969 0.04266125
## 2 of 45 1969 0.02285424
## 3 के 40 1969 0.02031488
## 4 and 40 1969 0.02031488
## 5 to 35 1969 0.01777552
## 6 और 31 1969 0.01574403
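The table above is raw term frequency only. bind_tf_idf() in the next chunk also computes the inverse document frequency, idf = ln(number of tweets / number of tweets containing the word), so a word appearing in, say, 5 of the 50 tweets gets:
log(50 / 5) # idf for a word found in 5 of 50 tweets
## [1] 2.302585
This is also why bigrams unique to a single tweet show idf = log(50/1) ≈ 3.91 in the tables below.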
tweet_tf_idf <- tokenized_tweets %>%
  count(word, tweet_id, sort = TRUE) %>%
  rename(count = n) %>%
  bind_tf_idf(word, tweet_id, count)
tweet_tf_idf %>%
  select(word, tweet_id, tf_idf, count) %>%
  group_by(tweet_id) %>%
  slice_max(order_by = tf_idf, n = 6, with_ties = FALSE) %>%
  filter(tweet_id < 6) %>%
  ggplot(aes(label = word)) +
  geom_text_wordcloud() +
  facet_grid(rows = vars(tweet_id))

tweets_bigram <- tweet %>%
  unnest_tokens(bigram, tweet, token = 'ngrams', n = 2)
head(tweets_bigram)
## tweet_id bigram
## 1 50 visited the
## 2 50 the telecommunication
## 3 50 telecommunication university
## 4 50 university in
## 5 50 in nha
## 6 50 nha trang
tweets_bigram <- tweets_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
bigram_counts <- tweets_bigram %>%
  count(word1, word2, sort = TRUE)
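If you need the filtered bigrams back as single strings (for example, to run the tf-idf step on the stop-word-free set), tidyr::unite() reverses the separate() above; a small sketch:
bigrams_united <- tweets_bigram %>%
  unite(bigram, word1, word2, sep = " ")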
tweet %>%
  unnest_tokens(bigram, tweet, token = 'ngrams', n = 2) %>%
  count(tweet_id, bigram) %>%
  bind_tf_idf(bigram, tweet_id, n) %>%
  group_by(tweet_id) %>%
  arrange(tweet_id, desc(tf_idf)) %>%
  head()
## # A tibble: 6 x 6
## # Groups: tweet_id [1]
## tweet_id bigram n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 अवसर प्रदान 1 0.0204 3.91 0.0798
## 2 1 अवसर मिला 1 0.0204 3.91 0.0798
## 3 1 आभार व्यक्त 1 0.0204 3.91 0.0798
## 4 1 आयोजित युवा 1 0.0204 3.91 0.0798
## 5 1 उन्होंने मुझे 1 0.0204 3.91 0.0798
## 6 1 और संस्कार 1 0.0204 3.91 0.0798
library('igraph')
library('ggraph')
bi_graph <- bigram_counts %>%
  filter(n > 2) %>%
  graph_from_data_frame()
ggraph(bi_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
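A common refinement of this plot, sketched below, draws directed edges with arrowheads (word1 -> word2) and fades rare bigrams via edge transparency; grid::arrow() supplies the arrowhead:
library(grid)
ggraph(bi_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), arrow = arrow(length = unit(2, "mm"))) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)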

tweets_trigram <- tweet %>%
  unnest_tokens(trigram, tweet, token = 'ngrams', n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% # separates on whitespace
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word)
trigram_counts <- tweets_trigram %>%
  count(word1, word2, word3, sort = TRUE)
tweet %>%
  unnest_tokens(trigram, tweet, token = 'ngrams', n = 3) %>%
  count(tweet_id, trigram) %>%
  bind_tf_idf(trigram, tweet_id, n) %>%
  group_by(tweet_id) %>%
  arrange(tweet_id, desc(tf_idf)) %>%
  head()
head()
## # A tibble: 6 x 6
## # Groups: tweet_id [1]
## tweet_id trigram n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 अवसर प्रदान किया 1 0.0208 3.91 0.0815
## 2 1 अवसर मिला स्वामीनारायण 1 0.0208 3.91 0.0815
## 3 1 आभार व्यक्त करता 1 0.0208 3.91 0.0815
## 4 1 आयोजित युवा शिविर 1 0.0208 3.91 0.0815
## 5 1 उन्होंने मुझे यह 1 0.0208 3.91 0.0815
## 6 1 और संस्कार की 1 0.0208 3.91 0.0815
# igraph and ggraph are already attached above
tri_graph <- trigram_counts %>%
  filter(n > 2) %>%
  graph_from_data_frame()
ggraph(tri_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

library('topicmodels')
library('tm')
# parameters
num_topics <- 3
top_n_to_get <- 10
tweets_lda <- tweet_tf_idf %>%
  anti_join(stop_words) %>%
  cast_dtm(document = tweet_id, term = word, value = count) %>%
  LDA(k = num_topics)
## Joining, by = "word"
tweets_lda
## A LDA_VEM topic model with 3 topics.
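LDA fitting is stochastic, so a rerun can permute or change the topics. A sketch of pinning the fit down (left commented so the output below stays as generated):
# for reproducible topics, pass a seed through the control list:
# LDA(k = num_topics, control = list(seed = 1234))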
tweet_topics <- tidy(tweets_lda, matrix = "beta") # beta = per-topic-per-word probabilities
head(tweet_topics)
## # A tibble: 6 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 की 3.28e- 2
## 2 2 की 7.79e-130
## 3 3 की 2.44e- 2
## 4 1 है 5.06e- 2
## 5 2 है 4.83e-130
## 6 3 है 1.38e- 2
tweet_topics_top_terms <- tweet_topics %>%
  group_by(topic) %>%
  top_n(top_n_to_get, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
head(tweet_topics_top_terms)
## # A tibble: 6 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 है 0.0506
## 2 1 के 0.0347
## 3 1 की 0.0328
## 4 1 में 0.0309
## 5 1 और 0.0300
## 6 1 भारत 0.0216
tweet_topics_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
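beta gives per-topic word probabilities; the companion gamma matrix gives per-tweet topic proportions, which is handy for assigning each tweet a dominant topic. A sketch:
tweet_gamma <- tidy(tweets_lda, matrix = "gamma")
tweet_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) # keep the dominant topic for each tweet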
