packages <- c(
  'rtweet',
  'httpuv',
  'tidyverse',
  'tidytext',
  'ggwordcloud',
  'reshape2',
  'wordcloud',
  'igraph',
  'ggraph',
  'topicmodels',
  'tm'
)
# install any packages that are missing, then load them all
package.check <- lapply(
  packages,
  FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x, character.only = TRUE)
    }
  }
)
## Loading required package: rtweet
## Warning: package 'rtweet' was built under R version 4.1.3
## Loading required package: httpuv
## Warning: package 'httpuv' was built under R version 4.1.3
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks rtweet::flatten()
## x dplyr::lag() masks stats::lag()
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.1.3
## Loading required package: ggwordcloud
## Warning: package 'ggwordcloud' was built under R version 4.1.3
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 4.1.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## Loading required package: wordcloud
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 4.1.3
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## Loading required package: ggraph
## Warning: package 'ggraph' was built under R version 4.1.3
## Loading required package: topicmodels
## Warning: package 'topicmodels' was built under R version 4.1.3
## Loading required package: tm
## Warning: package 'tm' was built under R version 4.1.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library('rtweet')
library('httpuv')
# credentials redacted -- substitute your own keys from the Twitter developer portal
api_key <- "YOUR_API_KEY"
api_secret_key <- "YOUR_API_SECRET_KEY"
app_name <- "CenterScrape"
access_token <- "YOUR_ACCESS_TOKEN"
access_token_secret <- "YOUR_ACCESS_TOKEN_SECRET"
token <- create_token(
  app = app_name,
  consumer_key = api_key,
  consumer_secret = api_secret_key,
  access_token = access_token,
  access_secret = access_token_secret
)
get_token()
## <Token>
## <oauth_endpoint>
## request: https://api.twitter.com/oauth/request_token
## authorize: https://api.twitter.com/oauth/authenticate
## access: https://api.twitter.com/oauth/access_token
## <oauth_app> CenterScrape
## key: <hidden>
## secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
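Note that create_token() comes from the rtweet 0.7.x used to build this document; it is deprecated in rtweet >= 1.0. A sketch of the newer interface, assuming rtweet 1.x:
# rtweet >= 1.0 sketch (assumption: rtweet 1.x API; not run here)
# auth <- rtweet_bot(
#   api_key = api_key,
#   api_secret = api_secret_key,
#   access_token = access_token,
#   access_secret = access_token_secret
# )
# auth_as(auth)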
library('tidyverse')
library('rtweet')
timelineDF <- get_timelines("rajnathsingh")
removeURL <- function(x) gsub("http[[:alnum:][:punct:]]*", "", x)
tweet <- timelineDF %>%
  filter(is_retweet == FALSE) %>%
  select(text) %>%
  mutate(tweet_id = n():1) %>% # number tweets so the oldest kept tweet is 1, avoiding a hard-coded 50:1
  rename(tweet = text) %>%
  mutate(tweet = removeURL(tweet))
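As a quick sanity check, removeURL() strips a t.co-style link from a hypothetical string (not taken from the scraped data):
removeURL("Great meeting today https://t.co/abc123 #India")
## [1] "Great meeting today  #India"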
library('tidytext')
tokenized_tweets <- unnest_tokens(tweet, input = 'tweet', output = 'word')
head(tokenized_tweets)
## tweet_id word
## 1 50 visited
## 2 50 the
## 3 50 telecommunication
## 4 50 university
## 5 50 in
## 6 50 nha
tokenized_tweets %>%
  count(word, sort = TRUE) %>%
  rename(count = n) %>%
  filter(count > 5) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(x = count, y = word)) +
  geom_col() +
  labs(title = "Count of Words in rajnathsingh Tweets") +
  scale_x_continuous(breaks = seq(0, 50, 5))

tokenized_tweets %>%
  anti_join(stop_words) %>% # drop rows whose word appears in the predefined stop-word list
  count(word, sort = TRUE) %>%
  rename(count = n) %>%
  filter(count > 5) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(x = count, y = word)) +
  geom_col() +
  labs(title = "Count of Words in rajnathsingh Tweets (Stop Words Removed)") +
  scale_x_continuous(breaks = seq(0, 50, 5))
## Joining, by = "word"
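The bars above still surface Hindi function words because stop_words covers English only. A minimal sketch of extending the lexicon with a hand-picked (illustrative, far from exhaustive) Hindi list:
hindi_stops <- tibble(word = c("के", "की", "है", "में", "और"), lexicon = "custom")
tokenized_tweets %>%
  anti_join(bind_rows(stop_words, hindi_stops)) %>%
  count(word, sort = TRUE)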

library('ggwordcloud')
tokenized_tweets %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(n > 4) %>%
  ggplot(aes(label = word, size = n, color = n)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 15)
## Joining, by = "word"

get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,872 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,862 more rows
tokenized_tweets %>%
  group_by(tweet_id) %>%
  inner_join(get_sentiments("afinn")) %>%
  summarise(mean_sentiment = mean(value)) %>%
  ggplot(aes(x = tweet_id, y = mean_sentiment)) +
  geom_col() +
  labs(title = "Mean Sentiment by Tweet - Afinn Lexicon", x = "Tweet ID", y = "Mean Sentiment") +
  scale_x_continuous(breaks = seq(1, 50)) +
  scale_y_continuous(breaks = seq(-1, 3, 0.5))
## Joining, by = "word"
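The same per-tweet idea works with the binary bing lexicon; a sketch computing net sentiment as positive minus negative word counts (the positive/negative column names come from the lexicon itself):
tokenized_tweets %>%
  inner_join(get_sentiments("bing")) %>%
  count(tweet_id, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net_sentiment = positive - negative)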

library('reshape2')
library('wordcloud')
tokenized_tweets %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>% # cast into a matrix with negative and positive columns
  comparison.cloud(colors = c("red", "green"),
                   max.words = 20)
## Joining, by = "word"
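Beyond positive/negative, the nrc lexicon previewed earlier also tags emotion categories; a short sketch tallying them across all tweets:
tokenized_tweets %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment, sort = TRUE)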

tokenized_tweets %>%
  count(word, sort = TRUE) %>%
  rename(count = n) %>%
  mutate(total = sum(count)) %>%
  mutate(tf = count / total) %>%
  head()
## word count total tf
## 1 the 84 1969 0.04266125
## 2 of 45 1969 0.02285424
## 3 के 40 1969 0.02031488
## 4 and 40 1969 0.02031488
## 5 to 35 1969 0.01777552
## 6 और 31 1969 0.01574403
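The table above is raw term frequency only. bind_tf_idf() in the next chunk also computes the inverse document frequency, idf = ln(number of tweets / number of tweets containing the word), so a word appearing in, say, 5 of the 50 tweets gets:
log(50 / 5) # idf for a word found in 5 of 50 tweets
## [1] 2.302585
This is also why bigrams unique to a single tweet show idf = log(50/1) ≈ 3.91 in the tables below.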
tweet_tf_idf <- tokenized_tweets %>%
  count(word, tweet_id, sort = TRUE) %>%
  rename(count = n) %>%
  bind_tf_idf(word, tweet_id, count)
tweet_tf_idf %>%
  select(word, tweet_id, tf_idf, count) %>%
  group_by(tweet_id) %>%
  slice_max(order_by = tf_idf, n = 6, with_ties = FALSE) %>%
  filter(tweet_id < 6) %>%
  ggplot(aes(label = word)) +
  geom_text_wordcloud() +
  facet_grid(rows = vars(tweet_id))

tweets_bigram <- tweet %>%
  unnest_tokens(bigram, tweet, token = 'ngrams', n = 2)
head(tweets_bigram)
## tweet_id bigram
## 1 50 visited the
## 2 50 the telecommunication
## 3 50 telecommunication university
## 4 50 university in
## 5 50 in nha
## 6 50 nha trang
tweets_bigram <- tweets_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
bigram_counts <- tweets_bigram %>%
  count(word1, word2, sort = TRUE)
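If you need the filtered bigrams back as single strings (for example, to run the tf-idf step on the stop-word-free set), tidyr::unite() reverses the separate() above; a small sketch:
bigrams_united <- tweets_bigram %>%
  unite(bigram, word1, word2, sep = " ")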
tweet %>%
  unnest_tokens(bigram, tweet, token = 'ngrams', n = 2) %>%
  count(tweet_id, bigram) %>%
  bind_tf_idf(bigram, tweet_id, n) %>%
  group_by(tweet_id) %>%
  arrange(tweet_id, desc(tf_idf)) %>%
  head()
## # A tibble: 6 x 6
## # Groups: tweet_id [1]
## tweet_id bigram n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 अवसर प्रदान 1 0.0204 3.91 0.0798
## 2 1 अवसर मिला 1 0.0204 3.91 0.0798
## 3 1 आभार व्यक्त 1 0.0204 3.91 0.0798
## 4 1 आयोजित युवा 1 0.0204 3.91 0.0798
## 5 1 उन्होंने मुझे 1 0.0204 3.91 0.0798
## 6 1 और संस्कार 1 0.0204 3.91 0.0798
library('igraph')
library('ggraph')
bi_graph <- bigram_counts %>%
  filter(n > 2) %>%
  graph_from_data_frame()
ggraph(bi_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
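A common refinement of this plot, sketched below, draws directed edges with arrowheads (word1 -> word2) and fades rare bigrams via edge transparency; grid::arrow() supplies the arrowhead:
library(grid)
ggraph(bi_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), arrow = arrow(length = unit(2, "mm"))) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)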

tweets_trigram <- tweet %>%
  unnest_tokens(trigram, tweet, token = 'ngrams', n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% # separates on whitespace
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word)
trigram_counts <- tweets_trigram %>%
  count(word1, word2, word3, sort = TRUE)
tweet %>%
  unnest_tokens(trigram, tweet, token = 'ngrams', n = 3) %>%
  count(tweet_id, trigram) %>%
  bind_tf_idf(trigram, tweet_id, n) %>%
  group_by(tweet_id) %>%
  arrange(tweet_id, desc(tf_idf)) %>%
  head()
head()
## # A tibble: 6 x 6
## # Groups: tweet_id [1]
## tweet_id trigram n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 अवसर प्रदान किया 1 0.0208 3.91 0.0815
## 2 1 अवसर मिला स्वामीनारायण 1 0.0208 3.91 0.0815
## 3 1 आभार व्यक्त करता 1 0.0208 3.91 0.0815
## 4 1 आयोजित युवा शिविर 1 0.0208 3.91 0.0815
## 5 1 उन्होंने मुझे यह 1 0.0208 3.91 0.0815
## 6 1 और संस्कार की 1 0.0208 3.91 0.0815
# igraph and ggraph are already attached above
tri_graph <- trigram_counts %>%
  filter(n > 2) %>%
  graph_from_data_frame()
ggraph(tri_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

library('topicmodels')
library('tm')
# parameters
num_topics <- 3
top_n_to_get <- 10
tweets_lda <- tweet_tf_idf %>%
  anti_join(stop_words) %>%
  cast_dtm(document = tweet_id, term = word, value = count) %>%
  LDA(k = num_topics)
## Joining, by = "word"
tweets_lda
## A LDA_VEM topic model with 3 topics.
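LDA fitting is stochastic, so a rerun can permute or change the topics. A sketch of pinning the fit down (left commented so the output below stays as generated):
# for reproducible topics, pass a seed through the control list:
# LDA(k = num_topics, control = list(seed = 1234))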
tweet_topics <- tidy(tweets_lda, matrix = "beta") # beta = per-topic-per-word probabilities
head(tweet_topics)
## # A tibble: 6 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 की 3.28e- 2
## 2 2 की 7.79e-130
## 3 3 की 2.44e- 2
## 4 1 है 5.06e- 2
## 5 2 है 4.83e-130
## 6 3 है 1.38e- 2
tweet_topics_top_terms <- tweet_topics %>%
  group_by(topic) %>%
  top_n(top_n_to_get, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
head(tweet_topics_top_terms)
## # A tibble: 6 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 है 0.0506
## 2 1 के 0.0347
## 3 1 की 0.0328
## 4 1 में 0.0309
## 5 1 और 0.0300
## 6 1 भारत 0.0216
tweet_topics_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
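beta gives per-topic word probabilities; the companion gamma matrix gives per-tweet topic proportions, which is handy for assigning each tweet a dominant topic. A sketch:
tweet_gamma <- tidy(tweets_lda, matrix = "gamma")
tweet_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) # keep the dominant topic for each tweet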
