This text dataset is the full text of President Tinubu's inaugural speech, from https://businessday.ng/news/legal-business/article/full-text-of-inaugural-speech-by-president-tinubu/. The first step in analyzing the address corpus is to load all the libraries needed for the analysis; this is followed by loading the dataset from its file path.
Load the required packages
#install.packages('tidyverse') #only install packages once
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#install.packages('tidytext') #only install packages once
library(tidytext)
#install.packages('SnowballC') #only install packages once
library(SnowballC)
#install.packages('wordcloud') #only install packages once
library(wordcloud)
## Loading required package: RColorBrewer
#install.packages('Rcpp') #actually, this package may need to be updated
library(Rcpp)
file_path <- r"(C:\Users\ebene\Desktop\text analysis\PresidentTinubu.txt)"
file_size <- file.info(file_path)$size
tinu<- readChar(file_path, file_size)
# Number of elements: the speech was read in as a single character string
length(tinu)
## [1] 1
# explore
summary(tinu)
## Length Class Mode
## 1 character character
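As an aside, readr (attached with the tidyverse) can read the whole file into a single string without computing the file size first; a minimal equivalent sketch, assuming the same file_path:
# Equivalent read: readr::read_file() returns the entire file as one string
tinu <- readr::read_file(file_path)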
Let's extract a substring from the tinu data, from the 1st character up to the 400th. The commented line below shows the same kind of call with the stringr package specified explicitly.
# view
tinu %>% str_sub(1, 400)
## [1] "My Fellow Citizens,\r\n\r\nI stand before you honoured to assume the sacred mandate you have given me. My love for this nation is abiding. My confidence in its people, unwavering. And my faith in God Almighty, absolute. I know that His hand shall provide the needed moral strength and clarity of purpose in those instances when we seem to have reached the limits of our human capacity.\r\n\r\nThis day is bol"
#tinu %>% stringr::str_sub(1, 550) # same call with the stringr namespace made explicit
When an element of the output is TRUE, the keyword is present in the text; when it is FALSE, the keyword is missing. The second block of code counts the number of matches, the third shows where the keywords are mentioned, and the last lets us view the surrounding text around the keywords.
# detect keywords
tinu %>% stringr::str_detect(c('Democracy', 'democracy', 'Security', 'security', 'Nation', 'nation', 'Unity', 'unity', 'Diversity', 'diversity','Agriculture', 'agriculture','Terrorism', 'terrorism', 'Job', 'job', 'Education', 'education'))
## [1] FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE
## [13] FALSE FALSE TRUE TRUE FALSE TRUE
# count the number of matches of a substring
# Security, diversity, jobs, agriculture and unity all play a crucial role in the development and existence of a nation
tinu %>% stringr::str_count("Security")
## [1] 1
tinu %>% stringr::str_count("security")
## [1] 4
tinu %>% stringr::str_count("Diversity")
## [1] 0
tinu %>% stringr::str_count("diversity")
## [1] 1
tinu %>% stringr::str_count("Job")
## [1] 1
tinu %>% stringr::str_count("job")
## [1] 4
tinu %>% stringr::str_count("Education")
## [1] 0
tinu %>% stringr::str_count("education")
## [1] 1
Summing the capitalised and lower-case spellings of each keyword gives: security: 5, diversity: 1, job: 5 and education: 1.
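These per-spelling counts can also be collected in one pass with a case-insensitive pattern; a minimal sketch using stringr::regex() (the keyword vector here is illustrative):
# Case-insensitive counts for several keywords at once
keywords <- c("security", "diversity", "job", "education")
sapply(keywords, function(kw) stringr::str_count(tinu, stringr::regex(kw, ignore_case = TRUE)))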
# Where is this keyword mentioned?
tinu %>% stringr::str_locate_all('job')
## [[1]]
## start end
## [1,] 6547 6549
## [2,] 8396 8398
## [3,] 9987 9989
## [4,] 10270 10272
# View surrounding text (requires regex)
tinu %>% stringr::str_extract_all(".{50}(job).{50}")
## [[1]]
## [1] "omy to bring about growth and development through job creation, food security and an end of extreme pov"
## [2] "public infrastructure, education, health care and jobs that will materially improve the lives of millio"
# Count characters
tinu %>% str_count()
## [1] 12026
# Change to a tibble (tidy dataframe)
tokens_speech <- tibble(tinu)
# Tokenize
tokens_speech <- tokens_speech %>% tidytext::unnest_tokens(output=word, input=tinu, token='words', to_lower=TRUE)
Tokenizing splits the corpus into words (unnest_tokens() with token = 'words' uses a word tokenizer under the hood). In this case the result is a tibble with 1,993 rows, each row representing a single word.
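Under the hood, unnest_tokens() relies on the tokenizers package (a tidytext dependency); the same split can be inspected directly on the raw string, as in this minimal sketch:
# Direct word tokenization for comparison
raw_tokens <- tokenizers::tokenize_words(tinu, lowercase = TRUE)
length(raw_tokens[[1]]) # number of word tokens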
#add order of the words
tokens_speech <- tokens_speech %>% mutate(order = row_number())
# Count tokens
tokens_speech %>% nrow()
## [1] 1993
Let's view the first few words.
# First few words
tokens_speech[1:30, ]
## # A tibble: 30 × 2
## word order
## <chr> <int>
## 1 my 1
## 2 fellow 2
## 3 citizens 3
## 4 i 4
## 5 stand 5
## 6 before 6
## 7 you 7
## 8 honoured 8
## 9 to 9
## 10 assume 10
## # ℹ 20 more rows
# count the number of times a word appears
tokens_speech %>% dplyr::filter(word == 'nigeria') %>% count()
## # A tibble: 1 × 1
## n
## <int>
## 1 11
# Where is this keyword mentioned?
tokens_speech %>% dplyr::filter(word == 'nigeria')
## # A tibble: 11 × 2
## word order
## <chr> <int>
## 1 nigeria 432
## 2 nigeria 592
## 3 nigeria 699
## 4 nigeria 756
## 5 nigeria 1080
## 6 nigeria 1857
## 7 nigeria 1882
## 8 nigeria 1886
## 9 nigeria 1897
## 10 nigeria 1930
## 11 nigeria 1982
# Look at the most frequent words
tokens_speech %>%
group_by(word) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
filter(count > 10) %>%
mutate(tokens_speech = reorder(word, count)) %>%
ggplot(aes(x=count, y=tokens_speech)) +
geom_col()
The most frequent words are 'the', 'and', 'to' and 'of'. They carry little meaning for this analysis; such words are called stop words in English. Let's filter out the stop words using the anti_join() function.
The commented 'custom stop words' code below shows how a customized stop word list could be loaded; it was not used in this analysis, the stop word list from the tidytext package was used instead.
# Load custom stopwords
#custom_stop_words <- read_csv(r"(C:/Users/ebene/Desktop/text_NPL/stop_words_list.csv)", col_names = FALSE)
# Remove stop words
#tokens_speech <- tokens_speech %>%
#anti_join(custom_stop_words, by = c('word'='X1'))
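A small custom list could also be defined inline and combined with tidytext's built-in stop_words before the anti_join; a hedged sketch, kept commented out because it is not executed in this analysis (the extra words are purely illustrative):
#custom_extra <- tibble(word = c('fellow', 'citizens')) # hypothetical additions
#all_stops <- bind_rows(stop_words %>% select(word), custom_extra)
#tokens_speech <- tokens_speech %>% anti_join(all_stops, by = 'word')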
library(tidytext)
# Remove stop words
tokens_speech <- tokens_speech %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
tokens_speech %>% nrow()
## [1] 850
After removing the stop words, the words are plotted against their frequency of occurrence, with the cut-off reduced to 4. Note that nation, nigeria and policy top the chart, security and job sit in the middle, investment is at the base of the chart, and education did not make the cut.
tokens_speech %>%
group_by(word) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
filter(count >=4) %>%
mutate(token = reorder(word, count)) %>%
ggplot(aes(x=count, y=token)) +
geom_col()
The first block arranges the tokens_speech data frame by the word column and displays rows 315 to 325. The second block creates stemmed, which adds a new column, stem, holding the stemmed version of each word in the word column; it uses SnowballC::wordStem().
# look at similar words
arrange(tokens_speech, word)[315:325, ]
## # A tibble: 11 × 2
## word order
## <chr> <int>
## 1 funds 1621
## 2 funds 1661
## 3 furtherance 1065
## 4 future 211
## 5 future 220
## 6 future 1911
## 7 gdp 1253
## 8 generation 1315
## 9 god 35
## 10 god 264
## 11 god 1984
#Stem the tokens
stemmed <- tokens_speech %>% mutate(stem = SnowballC::wordStem(word))
# look at similar words now
arrange(stemmed, word)[315:325, ]
## # A tibble: 11 × 3
## word order stem
## <chr> <int> <chr>
## 1 funds 1621 fund
## 2 funds 1661 fund
## 3 furtherance 1065 further
## 4 future 211 futur
## 5 future 220 futur
## 6 future 1911 futur
## 7 gdp 1253 gdp
## 8 generation 1315 gener
## 9 god 35 god
## 10 god 264 god
## 11 god 1984 god
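To see how strongly the stemmer collapses related forms, one can count how many distinct surface words map to each stem; an optional sketch:
# Which stems collapse the most distinct word forms?
stemmed %>%
group_by(stem) %>%
summarize(n_forms = n_distinct(word)) %>%
arrange(desc(n_forms)) %>%
head(10)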
stemmed %>%
group_by(stem) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
filter(count > 4) %>%
mutate(token = reorder(stem, count)) %>%
ggplot(aes(x=count, y=token)) +
geom_col()
set.seed(200)
stemmed %>%
group_by(word) %>%
summarize(count = n()) %>%
with(wordcloud(words=word, freq=count, min.freq=3, max.words=100, random.order=F, rot.per=0.30, colors=brewer.pal(8, "Dark2")))
Below is a brief description of the arguments used in the wordcloud() function:
words – the words to be plotted
freq – the frequencies of the words
min.freq – words whose frequency is at or above this threshold are plotted (here set to 3)
max.words – the maximum number of words to display on the plot (here set to 100)
random.order – set to FALSE so that the words are plotted in order of decreasing frequency
rot.per – the proportion of words displayed as vertical text (90-degree rotation); here set to 0.30 (30%), feel free to adjust this to suit your preferences
colors – the word colours, going from the lowest to the highest frequencies
# sentiment dictionary from tidytext used for sentiment analysis
library(tidytext)
lm_dict <- get_sentiments("nrc")
lm_dict %>% group_by(sentiment) %>% summarize(count = n())
## # A tibble: 10 × 2
## sentiment count
## <chr> <int>
## 1 anger 1245
## 2 anticipation 837
## 3 disgust 1056
## 4 fear 1474
## 5 joy 687
## 6 negative 3316
## 7 positive 2308
## 8 sadness 1187
## 9 surprise 532
## 10 trust 1230
One popular sentiment lexicon for political analysis is the “NRC Emotion Lexicon”, developed by Saif M. Mohammad and Peter D. Turney. It includes sentiment annotations for various emotions and is commonly used in political sentiment analysis studies. You can obtain the NRC Emotion Lexicon with the get_sentiments() function from the tidytext package.
# Add sentiment
sentimented <- stemmed %>%
inner_join(lm_dict, by = 'word')
## Warning in inner_join(., lm_dict, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 8491 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Explore totals
sentimented %>%
group_by(sentiment) %>%
summarize(count = n(), percent = count/nrow(sentimented))
## # A tibble: 10 × 3
## sentiment count percent
## <chr> <int> <dbl>
## 1 anger 23 0.0387
## 2 anticipation 58 0.0976
## 3 disgust 14 0.0236
## 4 fear 43 0.0724
## 5 joy 63 0.106
## 6 negative 52 0.0875
## 7 positive 177 0.298
## 8 sadness 13 0.0219
## 9 surprise 15 0.0253
## 10 trust 136 0.229
sentimented %>%
group_by(sentiment) %>%
summarize(count = n(), percent = count/nrow(sentimented)) %>%
ggplot(aes(x='', y=percent, fill=sentiment)) +
geom_bar(width=1, stat='identity')
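The x = '' aesthetic suggests the stacked bar is one step away from a pie chart; if that is the intent, adding coord_polar() completes it (a hedged sketch of one possible follow-up, not a required step):
# Optional: show sentiment shares as a pie chart
sentimented %>%
group_by(sentiment) %>%
summarize(count = n(), percent = count/nrow(sentimented)) %>%
ggplot(aes(x='', y=percent, fill=sentiment)) +
geom_bar(width=1, stat='identity') +
coord_polar('y')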