PM_speech v2

ggplot(
count, aes(y = word, x = n)
) +
geom_col()+
labs(x = "Count of words that occured more than 100 times, in past 50 speeches")

count_eco <- tidy_review2 %>%
filter(word == "economy") %>%
count(word,date, title) %>%  
group_by(date) %>%
arrange(desc(n))

ggplot(
count_eco, aes(x = n, y = date)
) +
geom_col() +
labs(x = "Count of Word 'economy' in PM speeches")+
labs(y = "Date of the Speech")

The speeches with maximum usage of word ‘economy’ are the speeches addressed on 15th Aug, 2020 (Independence Day) and on 2nd June 2020. Word count more than 4.

Let’s also analyse the words that are in a way intergeable with ‘economy’ i.e. ‘economic’

count_eco1 <- tidy_review2 %>%
filter(word == "economic") %>%
count(word,date, title) %>%  
group_by(date) %>%
arrange(desc(n))

ggplot(
count_eco1, aes(x = n, y = date)
) +
geom_col() +
labs(x = "Count of Word 'economic' in PM speeches")+
labs(y = "Date of the Speech")

Word count more than 4. In this case, word ‘economic’ is used at three occasions - 12 May 2020, 16 Jun 2020, 15 Aug 2020.The usage is highest (13) on May 12, 2020; should be interesting to analyse this speech as well.

Sentiment Analysis of speeches that reflect high usage of ‘economy’ 1. 2nd June 2020 speech dissection - word ‘economy’ 6 times occurence

library(tidyr)
library(textdata)
bing <- get_sentiments("bing")
loughran <- get_sentiments("loughran")

jun2 <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
arrange(desc(n))

## Joining, by = "word"

jun2

##             word    sentiment n
## 1  opportunities     positive 6
## 2       strength     positive 4
## 3         crisis     negative 3
## 4           poor     negative 3
## 5      advantage     positive 2
## 6      difficult     negative 2
## 7     innovation     positive 2
## 8  possibilities  uncertainty 2
## 9       question     negative 2
## 10       succeed     positive 2
## 11       benefit     positive 1
## 12     benefited     positive 1
## 13        depend constraining 1
## 14        depend  uncertainty 1
## 15    dependence  uncertainty 1
## 16    difficulty     negative 1
## 17     effective     positive 1
## 18    empowering     positive 1
## 19        enable     positive 1
## 20   encouraging     positive 1
## 21  interference     negative 1
## 22    permission constraining 1
## 23     permitted constraining 1
## 24        pledge constraining 1
## 25     questions     negative 1
## 26        random  uncertainty 1
## 27   regulations    litigious 1
## 28       resolve     positive 1
## 29        slowed     negative 1
## 30     stabilize     positive 1
## 31 strengthening     positive 1
## 32   strengthens     positive 1
## 33        strict constraining 1
## 34     stringent     negative 1
## 35        strong     positive 1
## 36      stronger     positive 1
## 37  successfully     positive 1
## 38    whomsoever    litigious 1
## 39       worries     negative 1

ggplot(
jun2, aes(x = n, y = word)
) +
geom_col()+
labs(x = "List of sentiment words in the Speech on June 2 2020")

jun2_senti <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(
word2 = fct_reorder(word, n)
)

## Joining, by = "word"

ggplot(jun2_senti, aes(x = word2, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
labs(
title = "Sentiment Word Counts - Speech on Jun 2 2020",
x = "Words"
)

#Contribution to sentiment
jun2_bing <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%  
inner_join(bing) %>%  
count(word, sentiment, sort = TRUE) %>%
filter(n>1) %>% 
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n))

## Joining, by = "word"

ggplot(jun2_bing, aes(word, n, fill = sentiment)) +
geom_col() +
coord_flip() +
labs(y = "Sentiment Map - speech on Jun 2 2020")

Word Cloud_June 2nd PM Speech

library(wordcloud)
library(reshape2)

jun2_cloud <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%  
count(word) %>%
with(wordcloud(word, n, max.words = 100))

jun2_senti %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))

jun2_bing %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 100)

TOPIC MODELS - June 2 2020

dtm_review <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%
count(word, title) %>%
cast_dtm(title, word, n) %>%
as.matrix()  

#dtm_review[1:4, 2000:2004]

library(topicmodels)
lda_out <- LDA(dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))

lda_out

## A LDA_Gibbs topic model with 3 topics.

glimpse(lda_out)

## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
##   ..@ seedwords      : NULL
##   ..@ z              : int [1:890] 2 3 2 3 3 2 2 3 1 1 ...
##   ..@ alpha          : num 16.7
##   ..@ call           : language LDA(x = dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
##   ..@ Dim            : int [1:2] 1 512
##   ..@ control        :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
##   ..@ k              : int 3
##   ..@ terms          : chr [1:512] "125" "150" "200" "24" ...
##   ..@ documents      : chr "PMâ\200\231s address at the inauguration of Annual Session of Confederation of Indian Industry"
##   ..@ beta           : num [1:3, 1:512] -8.16 -5.15 -4.67 -8.16 -5.8 ...
##   ..@ gamma          : num [1, 1:3] 0.335 0.35 0.316
##   ..@ wordassignments:List of 5
##   .. ..$ i   : int [1:512] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..$ j   : int [1:512] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ v   : num [1:512] 3 2 2 3 1 2 2 1 1 2 ...
##   .. ..$ nrow: int 1
##   .. ..$ ncol: int 512
##   .. ..- attr(*, "class")= chr "simple_triplet_matrix"
##   ..@ loglikelihood  : num -5542
##   ..@ iter           : int 2000
##   ..@ logLiks        : num(0) 
##   ..@ n              : int 890

lda_topics <- lda_out %>%
tidy(matrix = "beta")
lda_topics %>%
arrange(desc(beta))

## # A tibble: 1,536 x 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 india      0.0576
##  2     1 friends    0.0547
##  3     2 country    0.0443
##  4     3 sector     0.0426
##  5     3 government 0.0365
##  6     2 industry   0.0278
##  7     1 trust      0.0261
##  8     2 decisions  0.0251
##  9     3 sectors    0.0245
## 10     1 â          0.0232
## # ... with 1,526 more rows

word_probs <- lda_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta))

ggplot(
word_probs,
aes(
term2,
beta,
fill = as.factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
labs(y = "Identified Main Three Topics in the Speech on June 2 2020") +
labs(x = "Words that constitute a story")

Sentiment Analysis of speeches that reflect high usage of ‘economy’ 2. 15th Aug 2020 speech dissection - word ‘economy’ 5 times occurence

aug15 <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
filter(n>2) %>%   
arrange(desc(n))

## Joining, by = "word"

aug15

##             word    sentiment  n
## 1       strength     positive 11
## 2           poor     negative  8
## 3          force     negative  7
## 4          dream     positive  6
## 5  opportunities     positive  6
## 6       progress     positive  5
## 7   requirements constraining  4
## 8     challenges     negative  3
## 9     commitment constraining  3
## 10        crisis     negative  3
## 11          lost     negative  3
## 12        recall     negative  3
## 13    sacrifices     negative  3
## 14    strengthen     positive  3
## 15        strong     positive  3

ggplot(
aug15, aes(x = n, y = word)
) +
geom_col()+
labs(x = "List of sentiment words in the Speech on 15 AUg 2020")

aug15_senti <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(
word2 = fct_reorder(word, n)
)

## Joining, by = "word"

ggplot(aug15_senti, aes(x = word2, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
labs(
title = "Sentiment Word Counts - Speech on 15 Aug 2020",
x = "Words"
)

#Contribution to sentiment
aug15_bing <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%  
inner_join(bing) %>%  
count(word, sentiment, sort = TRUE) %>%
filter(n>2) %>% 
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n))

## Joining, by = "word"

ggplot(aug15_bing, aes(word, n, fill = sentiment)) +
geom_col() +
coord_flip() +
labs(y = "Sentiment Map - Speech on 15 Aug")

Word Cloud_August 15th PM Speech

library(wordcloud)
library(reshape2)

aug15_cloud <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%  
count(word) %>%
with(wordcloud(word, n, max.words = 100))

#aug15_senti %>%
#count(word) %>%
#with(wordcloud(word, n, max.words = 100))

aug15_bing %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 100)

TOPIC MODELS - Aug 15 2020

dtm_review <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%
count(word, title) %>%
cast_dtm(title, word, n) %>%
as.matrix()  

lda_out <- LDA(dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))

lda_out

## A LDA_Gibbs topic model with 3 topics.

glimpse(lda_out)

## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
##   ..@ seedwords      : NULL
##   ..@ z              : int [1:2299] 3 1 3 3 2 2 3 1 2 2 ...
##   ..@ alpha          : num 16.7
##   ..@ call           : language LDA(x = dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
##   ..@ Dim            : int [1:2] 1 1104
##   ..@ control        :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
##   ..@ k              : int 3
##   ..@ terms          : chr [1:1104] "\210à" "µ" "µà" "1,00,000" ...
##   ..@ documents      : chr "PMâ\200\231s address to the Nation from the ramparts of the Red Fort"
##   ..@ beta           : num [1:3, 1:1104] -9.06 -9.11 -6.67 -6.66 -9.11 ...
##   ..@ gamma          : num [1, 1:3] 0.326 0.344 0.33
##   ..@ wordassignments:List of 5
##   .. ..$ i   : int [1:1104] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..$ j   : int [1:1104] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ v   : num [1:1104] 3 1 3 2 2 3 2 1 3 3 ...
##   .. ..$ nrow: int 1
##   .. ..$ ncol: int 1104
##   .. ..- attr(*, "class")= chr "simple_triplet_matrix"
##   ..@ loglikelihood  : num -15525
##   ..@ iter           : int 2000
##   ..@ logLiks        : num(0) 
##   ..@ n              : int 2299

lda_topics <- lda_out %>%
tidy(matrix = "beta")
lda_topics %>%
arrange(desc(beta))

## # A tibble: 3,312 x 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     3 india      0.0714
##  2     2 world      0.0423
##  3     3 country    0.0392
##  4     1 â          0.0327
##  5     2 à          0.0278
##  6     2 countrymen 0.0190
##  7     2 class      0.0179
##  8     2 middle     0.0179
##  9     1 nation     0.0176
## 10     2 corona     0.0168
## # ... with 3,302 more rows

word_probs <- lda_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta))

ggplot(
word_probs,
aes(
term2,
beta,
fill = as.factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
labs(y = "Identified Main Three Topics in the Speech on Aug 15 2020") +
labs(x = "Words that constitute a story")

Sentiment Analysis of speeches that reflect high usage of ‘economic’ 3. 12th May 2020 speech dissection - word ‘economic’ 13 times occurence

may12 <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
filter(n>1) %>%  
arrange(desc(n))

## Joining, by = "word"

may12

##           word sentiment  n
## 1       crisis  negative 17
## 2      resolve  positive 11
## 3     strength  positive  4
## 4       strong  positive  4
## 5    happiness  positive  3
## 6         poor  negative  3
## 7     progress  positive  3
## 8   prosperous  positive  3
## 9   strengthen  positive  3
## 10   confident  positive  2
## 11   destroyed  negative  2
## 12 opportunity  positive  2
## 13    question  negative  2

ggplot(
may12, aes(x = n, y = word)
) +
geom_col()+
labs(x = "List of sentiment words in the Speech on 12 May 2020")

may12_senti <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(
word2 = fct_reorder(word, n)
)

## Joining, by = "word"

ggplot(may12_senti, aes(x = word2, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
labs(
title = "Sentiment Word Counts - Speech on 12 May",
x = "Words"
)

#Sentiment Map
may12_bing <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%  
inner_join(bing) %>%  
count(word, sentiment, sort = TRUE) %>%
filter(n>1) %>% 
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n))

## Joining, by = "word"

ggplot(may12_bing, aes(word, n, fill = sentiment)) +
geom_col() +
coord_flip() +
labs(y = "Sentiment Map - Speech on 12 May")

Word Cloud_May 12th PM Speech

library(wordcloud)
library(reshape2)

may12_cloud <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%  
count(word) %>%
with(wordcloud(word, n, max.words = 100))

#may12_senti %>%
#count(word) %>%
#with(wordcloud(word, n, max.words = 100))

may12_bing %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 100)

TOPIC MODELS - May 12 2020

dtm_review <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%
count(word, title) %>%
cast_dtm(title, word, n) %>%
as.matrix()  

lda_out <- LDA(dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))

lda_out

## A LDA_Gibbs topic model with 3 topics.

glimpse(lda_out)

## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
##   ..@ seedwords      : NULL
##   ..@ z              : int [1:1017] 2 2 2 1 3 3 1 3 3 2 ...
##   ..@ alpha          : num 16.7
##   ..@ call           : language LDA(x = dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
##   ..@ Dim            : int [1:2] 1 528
##   ..@ control        :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
##   ..@ k              : int 3
##   ..@ terms          : chr [1:528] "µà" "10" "130" "18" ...
##   ..@ documents      : chr "PMâ\200\231s address to the Nation on 12.5.2020"
##   ..@ beta           : num [1:3, 1:528] -8.39 -4.72 -8.26 -5.99 -8.15 ...
##   ..@ gamma          : num [1, 1:3] 0.379 0.291 0.33
##   ..@ wordassignments:List of 5
##   .. ..$ i   : int [1:528] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..$ j   : int [1:528] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ v   : num [1:528] 2 1 3 1 3 2 2 3 2 1 ...
##   .. ..$ nrow: int 1
##   .. ..$ ncol: int 528
##   .. ..- attr(*, "class")= chr "simple_triplet_matrix"
##   ..@ loglikelihood  : num -6075
##   ..@ iter           : int 2000
##   ..@ logLiks        : num(0) 
##   ..@ n              : int 1017

lda_topics <- lda_out %>%
tidy(matrix = "beta")
lda_topics %>%
arrange(desc(beta))

## # A tibble: 1,584 x 3
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 à        0.0842
##  2     2 india    0.0608
##  3     3 â        0.0596
##  4     1 world    0.0433
##  5     3 india    0.0415
##  6     1 crisis   0.0388
##  7     1 reliant  0.0365
##  8     3 friends  0.0364
##  9     1 economic 0.0297
## 10     1 package  0.0297
## # ... with 1,574 more rows

word_probs <- lda_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta))

ggplot(
word_probs,
aes(
term2,
beta,
fill = as.factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
labs(y = "Identified Main Three Topics in the Speech on May 12 2020") +
labs(x = "Words that constitute a story")

Refereces: https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html

PM_speech v2

Ramya Emandi

11/09/2020

R Markdown