ggplot(
count, aes(y = word, x = n)
) +
geom_col()+
labs(x = "Count of words that occured more than 100 times, in past 50 speeches")
count_eco <- tidy_review2 %>%
filter(word == "economy") %>%
count(word,date, title) %>%
group_by(date) %>%
arrange(desc(n))
ggplot(
count_eco, aes(x = n, y = date)
) +
geom_col() +
labs(x = "Count of Word 'economy' in PM speeches")+
labs(y = "Date of the Speech")
The speeches with maximum usage of word ‘economy’ are the speeches addressed on 15th Aug, 2020 (Independence Day) and on 2nd June 2020. Word count more than 4.
Let’s also analyse the words that are in a way intergeable with ‘economy’ i.e. ‘economic’
count_eco1 <- tidy_review2 %>%
filter(word == "economic") %>%
count(word,date, title) %>%
group_by(date) %>%
arrange(desc(n))
ggplot(
count_eco1, aes(x = n, y = date)
) +
geom_col() +
labs(x = "Count of Word 'economic' in PM speeches")+
labs(y = "Date of the Speech")
Word count more than 4. In this case, word ‘economic’ is used at three occasions - 12 May 2020, 16 Jun 2020, 15 Aug 2020.The usage is highest (13) on May 12, 2020; should be interesting to analyse this speech as well.
Sentiment Analysis of speeches that reflect high usage of ‘economy’ 1. 2nd June 2020 speech dissection - word ‘economy’ 6 times occurence
library(tidyr)
library(textdata)
bing <- get_sentiments("bing")
loughran <- get_sentiments("loughran")
jun2 <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
arrange(desc(n))
## Joining, by = "word"
jun2
## word sentiment n
## 1 opportunities positive 6
## 2 strength positive 4
## 3 crisis negative 3
## 4 poor negative 3
## 5 advantage positive 2
## 6 difficult negative 2
## 7 innovation positive 2
## 8 possibilities uncertainty 2
## 9 question negative 2
## 10 succeed positive 2
## 11 benefit positive 1
## 12 benefited positive 1
## 13 depend constraining 1
## 14 depend uncertainty 1
## 15 dependence uncertainty 1
## 16 difficulty negative 1
## 17 effective positive 1
## 18 empowering positive 1
## 19 enable positive 1
## 20 encouraging positive 1
## 21 interference negative 1
## 22 permission constraining 1
## 23 permitted constraining 1
## 24 pledge constraining 1
## 25 questions negative 1
## 26 random uncertainty 1
## 27 regulations litigious 1
## 28 resolve positive 1
## 29 slowed negative 1
## 30 stabilize positive 1
## 31 strengthening positive 1
## 32 strengthens positive 1
## 33 strict constraining 1
## 34 stringent negative 1
## 35 strong positive 1
## 36 stronger positive 1
## 37 successfully positive 1
## 38 whomsoever litigious 1
## 39 worries negative 1
ggplot(
jun2, aes(x = n, y = word)
) +
geom_col()+
labs(x = "List of sentiment words in the Speech on June 2 2020")
jun2_senti <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(
word2 = fct_reorder(word, n)
)
## Joining, by = "word"
ggplot(jun2_senti, aes(x = word2, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
labs(
title = "Sentiment Word Counts - Speech on Jun 2 2020",
x = "Words"
)
#Contribution to sentiment
jun2_bing <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%
inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
filter(n>1) %>%
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n))
## Joining, by = "word"
ggplot(jun2_bing, aes(word, n, fill = sentiment)) +
geom_col() +
coord_flip() +
labs(y = "Sentiment Map - speech on Jun 2 2020")
Word Cloud_June 2nd PM Speech
library(wordcloud)
library(reshape2)
jun2_cloud <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
jun2_senti %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
jun2_bing %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("#F8766D", "#00BFC4"),
max.words = 100)
TOPIC MODELS - June 2 2020
dtm_review <- tidy_review2 %>%
filter(date == 'Jun 02, 2020') %>%
count(word, title) %>%
cast_dtm(title, word, n) %>%
as.matrix()
#dtm_review[1:4, 2000:2004]
library(topicmodels)
lda_out <- LDA(dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
lda_out
## A LDA_Gibbs topic model with 3 topics.
glimpse(lda_out)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:890] 2 3 2 3 3 2 2 3 1 1 ...
## ..@ alpha : num 16.7
## ..@ call : language LDA(x = dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
## ..@ Dim : int [1:2] 1 512
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 3
## ..@ terms : chr [1:512] "125" "150" "200" "24" ...
## ..@ documents : chr "PMâ\200\231s address at the inauguration of Annual Session of Confederation of Indian Industry"
## ..@ beta : num [1:3, 1:512] -8.16 -5.15 -4.67 -8.16 -5.8 ...
## ..@ gamma : num [1, 1:3] 0.335 0.35 0.316
## ..@ wordassignments:List of 5
## .. ..$ i : int [1:512] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..$ j : int [1:512] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..$ v : num [1:512] 3 2 2 3 1 2 2 1 1 2 ...
## .. ..$ nrow: int 1
## .. ..$ ncol: int 512
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -5542
## ..@ iter : int 2000
## ..@ logLiks : num(0)
## ..@ n : int 890
lda_topics <- lda_out %>%
tidy(matrix = "beta")
lda_topics %>%
arrange(desc(beta))
## # A tibble: 1,536 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 india 0.0576
## 2 1 friends 0.0547
## 3 2 country 0.0443
## 4 3 sector 0.0426
## 5 3 government 0.0365
## 6 2 industry 0.0278
## 7 1 trust 0.0261
## 8 2 decisions 0.0251
## 9 3 sectors 0.0245
## 10 1 â 0.0232
## # ... with 1,526 more rows
word_probs <- lda_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta))
ggplot(
word_probs,
aes(
term2,
beta,
fill = as.factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
labs(y = "Identified Main Three Topics in the Speech on June 2 2020") +
labs(x = "Words that constitute a story")
Sentiment Analysis of speeches that reflect high usage of ‘economy’ 2. 15th Aug 2020 speech dissection - word ‘economy’ 5 times occurence
aug15 <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
filter(n>2) %>%
arrange(desc(n))
## Joining, by = "word"
aug15
## word sentiment n
## 1 strength positive 11
## 2 poor negative 8
## 3 force negative 7
## 4 dream positive 6
## 5 opportunities positive 6
## 6 progress positive 5
## 7 requirements constraining 4
## 8 challenges negative 3
## 9 commitment constraining 3
## 10 crisis negative 3
## 11 lost negative 3
## 12 recall negative 3
## 13 sacrifices negative 3
## 14 strengthen positive 3
## 15 strong positive 3
ggplot(
aug15, aes(x = n, y = word)
) +
geom_col()+
labs(x = "List of sentiment words in the Speech on 15 AUg 2020")
aug15_senti <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(
word2 = fct_reorder(word, n)
)
## Joining, by = "word"
ggplot(aug15_senti, aes(x = word2, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
labs(
title = "Sentiment Word Counts - Speech on 15 Aug 2020",
x = "Words"
)
#Contribution to sentiment
aug15_bing <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%
inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
filter(n>2) %>%
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n))
## Joining, by = "word"
ggplot(aug15_bing, aes(word, n, fill = sentiment)) +
geom_col() +
coord_flip() +
labs(y = "Sentiment Map - Speech on 15 Aug")
Word Cloud_August 15th PM Speech
library(wordcloud)
library(reshape2)
aug15_cloud <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
#aug15_senti %>%
#count(word) %>%
#with(wordcloud(word, n, max.words = 100))
aug15_bing %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("#F8766D", "#00BFC4"),
max.words = 100)
TOPIC MODELS - Aug 15 2020
dtm_review <- tidy_review2 %>%
filter(date == 'Aug 15, 2020') %>%
count(word, title) %>%
cast_dtm(title, word, n) %>%
as.matrix()
lda_out <- LDA(dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
lda_out
## A LDA_Gibbs topic model with 3 topics.
glimpse(lda_out)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:2299] 3 1 3 3 2 2 3 1 2 2 ...
## ..@ alpha : num 16.7
## ..@ call : language LDA(x = dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
## ..@ Dim : int [1:2] 1 1104
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 3
## ..@ terms : chr [1:1104] "\210à" "µ" "µà" "1,00,000" ...
## ..@ documents : chr "PMâ\200\231s address to the Nation from the ramparts of the Red Fort"
## ..@ beta : num [1:3, 1:1104] -9.06 -9.11 -6.67 -6.66 -9.11 ...
## ..@ gamma : num [1, 1:3] 0.326 0.344 0.33
## ..@ wordassignments:List of 5
## .. ..$ i : int [1:1104] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..$ j : int [1:1104] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..$ v : num [1:1104] 3 1 3 2 2 3 2 1 3 3 ...
## .. ..$ nrow: int 1
## .. ..$ ncol: int 1104
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -15525
## ..@ iter : int 2000
## ..@ logLiks : num(0)
## ..@ n : int 2299
lda_topics <- lda_out %>%
tidy(matrix = "beta")
lda_topics %>%
arrange(desc(beta))
## # A tibble: 3,312 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 3 india 0.0714
## 2 2 world 0.0423
## 3 3 country 0.0392
## 4 1 â 0.0327
## 5 2 à 0.0278
## 6 2 countrymen 0.0190
## 7 2 class 0.0179
## 8 2 middle 0.0179
## 9 1 nation 0.0176
## 10 2 corona 0.0168
## # ... with 3,302 more rows
word_probs <- lda_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta))
ggplot(
word_probs,
aes(
term2,
beta,
fill = as.factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
labs(y = "Identified Main Three Topics in the Speech on Aug 15 2020") +
labs(x = "Words that constitute a story")
Sentiment Analysis of speeches that reflect high usage of ‘economic’ 3. 12th May 2020 speech dissection - word ‘economic’ 13 times occurence
may12 <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
filter(n>1) %>%
arrange(desc(n))
## Joining, by = "word"
may12
## word sentiment n
## 1 crisis negative 17
## 2 resolve positive 11
## 3 strength positive 4
## 4 strong positive 4
## 5 happiness positive 3
## 6 poor negative 3
## 7 progress positive 3
## 8 prosperous positive 3
## 9 strengthen positive 3
## 10 confident positive 2
## 11 destroyed negative 2
## 12 opportunity positive 2
## 13 question negative 2
ggplot(
may12, aes(x = n, y = word)
) +
geom_col()+
labs(x = "List of sentiment words in the Speech on 12 May 2020")
may12_senti <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%
inner_join(loughran)%>%
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(
word2 = fct_reorder(word, n)
)
## Joining, by = "word"
ggplot(may12_senti, aes(x = word2, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
labs(
title = "Sentiment Word Counts - Speech on 12 May",
x = "Words"
)
#Sentiment Map
may12_bing <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%
inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
filter(n>1) %>%
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n))
## Joining, by = "word"
ggplot(may12_bing, aes(word, n, fill = sentiment)) +
geom_col() +
coord_flip() +
labs(y = "Sentiment Map - Speech on 12 May")
Word Cloud_May 12th PM Speech
library(wordcloud)
library(reshape2)
may12_cloud <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
#may12_senti %>%
#count(word) %>%
#with(wordcloud(word, n, max.words = 100))
may12_bing %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("#F8766D", "#00BFC4"),
max.words = 100)
TOPIC MODELS - May 12 2020
dtm_review <- tidy_review2 %>%
filter(date == 'May 12, 2020') %>%
count(word, title) %>%
cast_dtm(title, word, n) %>%
as.matrix()
lda_out <- LDA(dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
lda_out
## A LDA_Gibbs topic model with 3 topics.
glimpse(lda_out)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:1017] 2 2 2 1 3 3 1 3 3 2 ...
## ..@ alpha : num 16.7
## ..@ call : language LDA(x = dtm_review, k = 3, method = "Gibbs", control = list(seed = 42))
## ..@ Dim : int [1:2] 1 528
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 3
## ..@ terms : chr [1:528] "µà" "10" "130" "18" ...
## ..@ documents : chr "PMâ\200\231s address to the Nation on 12.5.2020"
## ..@ beta : num [1:3, 1:528] -8.39 -4.72 -8.26 -5.99 -8.15 ...
## ..@ gamma : num [1, 1:3] 0.379 0.291 0.33
## ..@ wordassignments:List of 5
## .. ..$ i : int [1:528] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..$ j : int [1:528] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..$ v : num [1:528] 2 1 3 1 3 2 2 3 2 1 ...
## .. ..$ nrow: int 1
## .. ..$ ncol: int 528
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -6075
## ..@ iter : int 2000
## ..@ logLiks : num(0)
## ..@ n : int 1017
lda_topics <- lda_out %>%
tidy(matrix = "beta")
lda_topics %>%
arrange(desc(beta))
## # A tibble: 1,584 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 à 0.0842
## 2 2 india 0.0608
## 3 3 â 0.0596
## 4 1 world 0.0433
## 5 3 india 0.0415
## 6 1 crisis 0.0388
## 7 1 reliant 0.0365
## 8 3 friends 0.0364
## 9 1 economic 0.0297
## 10 1 package 0.0297
## # ... with 1,574 more rows
word_probs <- lda_topics %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta))
ggplot(
word_probs,
aes(
term2,
beta,
fill = as.factor(topic)
)
) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()+
labs(y = "Identified Main Three Topics in the Speech on May 12 2020") +
labs(x = "Words that constitute a story")
Refereces: https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html