Text Analysis on Social Indicators

Author

Burak Demirtas

Indicators Analysis

This analysis is about the word counts for social indicators defined in the given data.

Hide / show the code
# Loading the libraries-----------------------------------------------------
# requireNamespace() only checks that pacman is installed; it does not attach
# it. Calling p_load() through the pacman:: prefix therefore works even right
# after a fresh install, where a bare p_load() would not be found.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  pacman, readxl, RColorBrewer, tidytext, tidyverse, textdata, wordcloud
)
# Working directory---------------------------------------------------------
# The directory is recorded but not changed: setting it to its own current
# value has no effect, so that call is omitted.
WorkingDirectory <- getwd()
# Loading the Data----------------------------------------------------------
# Relative path: the workbook is expected in the current working directory.
dataset.df <- read_excel("indicators.xlsx")
#--------------------------------------------------------------------------

The original data, which I refined in Excel, looks like this:

Hide / show the code
# Compact preview of the loaded data: one line per column with type and values
glimpse(x = dataset.df)
Rows: 248
Columns: 6
$ Goals       <chr> "Goal 1. End poverty in all its forms everywhere", "Goal 1…
$ Goals_Short <chr> "End Poverty", "End Poverty", "End Poverty", "End Poverty"…
$ Item_lvl1   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2…
$ Item_lvl2   <chr> "1", "2", "2", "3", "4", "4", "5", "5", "5", "5", "a", "a"…
$ Item_lvl3   <dbl> 1, 1, 2, 1, 1, 2, 1, 2, 3, 4, 1, 2, 1, 1, 2, 1, 2, 3, 1, 2…
$ Indicators  <chr> "Proportion of the population living below the internation…

Tokenizing

Each indicator text is split into individual word tokens, and the other columns are carried along for every token:

Hide / show the code
# Split the Indicators text into one word per row. The remaining columns are
# repeated for every token, and Indicators itself is replaced by `word`.
tokenized_data <- unnest_tokens(
  tbl = dataset.df,
  output = word,
  input = Indicators
)
glimpse(x = tokenized_data)
Rows: 3,941
Columns: 6
$ Goals       <chr> "Goal 1. End poverty in all its forms everywhere", "Goal 1…
$ Goals_Short <chr> "End Poverty", "End Poverty", "End Poverty", "End Poverty"…
$ Item_lvl1   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ Item_lvl2   <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"…
$ Item_lvl3   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ word        <chr> "proportion", "of", "the", "population", "living", "below"…

Defining Custom Stop Words

Words that carry no meaning for this analysis are defined in a custom stop-word list here.

Hide / show the code
# Extra tokens to discard, tagged with their own lexicon label so they can be
# told apart from the entries that ship with tidytext. Extend the `word`
# vector to add more.
custom_stop_words <- tibble(
  word = c("ii", "ics", "ihi", "iii"),
  lexicon = "CUSTOM"
)

# Append the custom entries to the built-in stop_words table
stop_words_customized <- bind_rows(stop_words, custom_stop_words)

Cleaning the Words and Removing Stop Words

The data is cleaned and the words are counted. Here are the top words:

Hide / show the code
# Remove stop words, strip leftover punctuation and possessive endings, and
# drop every token that contains a digit.
tokenized_data_cleaned <- tokenized_data %>%
  # Join key spelled out (as in the sentiment join) instead of relying on
  # whichever column names the two tables happen to share.
  anti_join(stop_words_customized, by = join_by(word)) %>%
  mutate(
    # Keep only letters, digits, apostrophes and spaces
    word = str_replace_all(word, "[^[:alnum:]' ]+", ""),
    # Drop a trailing possessive 's
    word = str_replace_all(word, "'s\\b", "")
  ) %>%
  filter(!grepl("\\d", word))

# Word frequencies, most frequent first
tokenized_counts <- tokenized_data_cleaned %>%
  count(word, sort = TRUE)

# Top 20 words by count; ties at the cut-off are kept, so this can hold more
# than 20 rows.
topWords <- tokenized_counts %>% slice_max(n, n = 20)

# Horizontal bar chart, bars ordered by count
ggplot(topWords, aes(y = fct_reorder(word, n), x = n)) +
  geom_col() +
  theme_minimal() +
  labs(y = "Words", x = "Counts", title = "Top Words")

Creating Word Cloud

Word clouds are drawn for the top 50, 70, and 100 words in three separate graphs.

Hide / show the code
# Eight-colour "Dark2" palette from RColorBrewer, used for the word clouds
my_palette <- brewer.pal(n = 8, name = "Dark2")
# Fix the random seed so that repeated runs give the same result
set.seed(seed = 42)
Hide / show the code
# Word cloud limited to 50 words
wordcloud(
  words = tokenized_counts[["word"]],  # the words to draw
  freq = tokenized_counts[["n"]],      # their counts
  colors = my_palette,                 # Dark2 palette defined earlier
  max.words = 50                       # cap on the number of words plotted
)

Hide / show the code
# Word cloud limited to 70 words
wordcloud(
  words = tokenized_counts[["word"]],  # the words to draw
  freq = tokenized_counts[["n"]],      # their counts
  colors = my_palette,                 # Dark2 palette defined earlier
  max.words = 70                       # cap on the number of words plotted
)

Hide / show the code
# Word cloud limited to 100 words
wordcloud(
  words = tokenized_counts[["word"]],  # the words to draw
  freq = tokenized_counts[["n"]],      # their counts
  colors = my_palette,                 # Dark2 palette defined earlier
  max.words = 100                      # cap on the number of words plotted
)

Sentiment Analysis

I also ran a sentiment analysis, matching each word to its associated emotions according to the NRC dictionary:

Hide / show the code
# Attach the NRC sentiment labels to the counted words. Words missing from the
# lexicon are dropped; a word carrying several labels gets one row per label.
sentimented_data <- inner_join(
  tokenized_counts,
  get_sentiments("nrc"),
  by = join_by(word)
)

# Rows per sentiment, largest first. Each row is one word/label pair, so this
# is the number of distinct words per sentiment, not weighted by word frequency.
sentiment_counts <- count(sentimented_data, sentiment, sort = TRUE)
sentiment_counts
# A tibble: 10 × 2
   sentiment        n
   <chr>        <int>
 1 positive       121
 2 trust           75
 3 negative        70
 4 fear            46
 5 sadness         41
 6 anticipation    40
 7 anger           33
 8 joy             29
 9 disgust         26
10 surprise        11

Word Cloud of Sentiments

Here is the result for 10 emotions listed in this dictionary:

Hide / show the code
# One colour for every label, given as a hex code with an alpha channel
my_palette3 <- "#2FA3EBB1"
# Word cloud of the sentiment labels, sized by their counts
wordcloud(
  words = sentiment_counts[["sentiment"]],
  freq = sentiment_counts[["n"]],
  colors = my_palette3
)

Emotion-Based Word Counts with NRC Dictionary

Finally, here are the top words by emotional association for two sentiment categories: "positive" and "negative".

Hide / show the code
sentimented_counts <- sentimented_data %>%
  # Restrict to the two polarity categories
  filter(sentiment == "positive" | sentiment == "negative") %>%
  # Rank the words within each sentiment separately
  group_by(sentiment) %>%
  # Top 10 words by count per sentiment; ties at the cut-off are kept, so a
  # group can return more than 10 rows
  slice_max(order_by = n, n = 10) %>%
  # Drop the grouping before further processing
  ungroup() %>%
  # Factor version of `word` whose levels follow the counts, for plotting order
  mutate(word_factor = fct_reorder(word, n))

# Show up to 30 rows so the full table is printed
print(sentimented_counts, n = 30)
# A tibble: 22 × 4
   word            n sentiment word_factor
   <chr>       <int> <chr>     <fct>      
 1 risk           13 negative  risk       
 2 disaster       12 negative  disaster   
 3 mortality       8 negative  mortality  
 4 violence        8 negative  violence   
 5 income          7 negative  income     
 6 government      6 negative  government 
 7 waste           6 negative  waste      
 8 bribe           4 negative  bribe      
 9 expenditure     4 negative  expenditure
10 gross           4 negative  gross      
11 poverty         4 negative  poverty    
12 population     45 positive  population 
13 sex            30 positive  sex        
14 public         16 positive  public     
15 assistance     11 positive  assistance 
16 land           10 positive  land       
17 food            7 positive  food       
18 growth          7 positive  growth     
19 income          7 positive  income     
20 share           7 positive  share      
21 status          7 positive  status     
22 united          7 positive  united     
Hide / show the code
# "Set1" palette from RColorBrewer; the two sentiments take its first two
# colours.
my_palette2 <- brewer.pal(n = 8, name = "Set1")

# One panel per sentiment, horizontal bars ordered by count
ggplot(
  data = sentimented_counts,
  mapping = aes(x = word_factor, y = n, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  # Free scales so that each panel lists only its own words
  facet_wrap(vars(sentiment), scales = "free") +
  # Flip so the words read along the vertical axis
  coord_flip() +
  labs(title = "Emotion-Based Word Counts", x = "Words") +
  scale_fill_manual(values = my_palette2)