## Let’s first load a single PDF file, the Budget Speech, available at http://www.finance.gov.pk/budget/budget_speech_english_2019_20.pdf
## install.packages(c("extrafont", "ggraph", "ggridges", "pdftools", "tidyverse", "tidytext", "forcats", "reshape2", "tidyr", "igraph", "widyr", "viridis"))
# load libraries ----
suppressPackageStartupMessages({
library(extrafont)
library(ggraph)
library(ggridges)
library(pdftools)
library(tidyverse)
library(tidytext)
library(forcats)
library(reshape2)
library(tidyr)
library(igraph)
library(widyr)
library(viridis)
})
## We’ll use pdftools to import the PDF file of the Budget Speech.
FBUDGP_import1 <- pdf_text("http://www.finance.gov.pk/budget/budget_speech_english_2019_20.pdf")
str(FBUDGP_import1)## chr [1:45] " Budget Speech 2019-20\r\n PART – I\r\n "| __truncated__ ...
### We’ve downloaded the PDF as a character vector of strings, one for each page (there are 45 pages in the Budget Speech).
### We’ve got a bunch of spaces and special characters \r and \n indicating line breaks.
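## To eyeball a raw page before cleaning, cat() renders those escapes as real line breaks; a quick peek at the first 300 characters of page 1:
cat(substr(FBUDGP_import1[1], 1, 300))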
## We can deal with this by splitting on \r with strsplit() and removing \n with gsub(). I have to look up regular expressions every time I use them.
FBUDGP_text_raw <-
data.frame(text=unlist(strsplit(FBUDGP_import1,"\r"))) %>%
mutate(Budget="2019",
line=row_number(),
text=gsub("\n","",text))
head(FBUDGP_text_raw)## text
## 1 Budget Speech 2019-20
## 2 PART – I
## 3 BismillahirRehmanir Raheem
## 4 Mr. Speaker,
## 5 1. I would like to start by thanking Almighty Allah, the most gracious,
## 6 the most merciful,as I present the first Annual Budget of the democratic
## Budget line
## 1 2019 1
## 2 2019 2
## 3 2019 3
## 4 2019 4
## 5 2019 5
## 6 2019 6
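## As an aside, the same cleanup can be written with stringr verbs (loaded with the tidyverse). A minimal, equivalent sketch; FBUDGP_text_raw_alt is just an illustrative name of mine:
FBUDGP_text_raw_alt <-
  tibble(text = unlist(str_split(FBUDGP_import1, "\r"))) %>%  # one row per split line
  mutate(Budget = "2019",
         line = row_number(),
         text = str_remove_all(text, "\n"))  # drop the stray \n characters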
# Now we’re about ready to run the textual analysis!
# Text mining
### Now we can begin to apply the tidytext mining techniques outlined in Text Mining with R. I took these data and walked pretty much step by step through the book and learned a lot. Let me share some highlights.
FBUDGP_text <-
FBUDGP_text_raw %>%
as_tibble() %>%
unnest_tokens(word,text)
FBUDGP_text## # A tibble: 11,286 x 3
## Budget line word
## <chr> <int> <chr>
## 1 2019 1 budget
## 2 2019 1 speech
## 3 2019 1 2019
## 4 2019 1 20
## 5 2019 2 part
## 6 2019 2 i
## 7 2019 3 bismillahirrehmanir
## 8 2019 3 raheem
## 9 2019 4 mr
## 10 2019 4 speaker
## # ... with 11,276 more rows
FBUDGP_text %>%
  count(word, sort = TRUE)## # A tibble: 2,328 x 2
## word n
## <chr> <int>
## 1 of 511
## 2 the 487
## 3 to 422
## 4 and 279
## 5 in 235
## 6 is 235
## 7 tax 207
## 8 be 163
## 9 for 157
## 10 a 126
## # ... with 2,318 more rows
## We have a lot of common words like “the”, “of”, and “in”. In text mining these are called “stop words”. We can remove them with an anti_join() against the stop_words list that comes with the tidytext package.
FBUDGP_text %>%
anti_join(stop_words)%>%
count(word, sort = TRUE) ## Joining, by = "word"
## # A tibble: 2,028 x 2
## word n
## <chr> <int>
## 1 tax 207
## 2 rs 119
## 3 proposed 101
## 4 billion 86
## 5 government 53
## 6 income 53
## 7 rate 48
## 8 sales 37
## 9 million 35
## 10 persons 35
## # ... with 2,018 more rows
## Getting better! But we still have some numbers in the text.
## Let’s drop them. We’ll take a heavy-handed approach and keep only alphabetic characters.
FBUDGP_text2 <-
FBUDGP_text %>%
mutate(word = gsub("[^A-Za-z ]","",word)) %>%
filter(word != "")
FBUDGP_text2 %>%
anti_join(stop_words)%>%
count(word, sort = TRUE) ## Joining, by = "word"
## # A tibble: 1,831 x 2
## word n
## <chr> <int>
## 1 tax 207
## 2 rs 120
## 3 proposed 101
## 4 billion 89
## 5 government 53
## 6 income 53
## 7 rate 48
## 8 sales 37
## 9 persons 36
## 10 million 35
## # ... with 1,821 more rows
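## With the counts cleaned up, the usual next step in Text Mining with R is a quick bar chart of the top words. A minimal sketch (top_n() keeps it compatible with older dplyr versions):
FBUDGP_text2 %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  top_n(15, n) %>%                        # keep the 15 most frequent words
  ggplot(aes(reorder(word, n), n)) +      # order bars by frequency
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "count")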
### What’s the overall sentiment of the Budget? Text mining allows us to score text, or portions of text, for sentiment. We can apply one of the sentiment lexicons supplied by tidytext to score the Budget Speech. For now we’ll use the bing lexicon from Bing Liu and collaborators.
# Let’s see what the most frequently used negative and positive words are, based on the bing lexicon.
FBUDGP_text2 %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE)## Joining, by = "word"
## # A tibble: 221 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 debt negative 17
## 2 free positive 10
## 3 immovable negative 10
## 4 refund positive 10
## 5 relief positive 10
## 6 enhanced positive 8
## 7 available positive 7
## 8 difficult negative 7
## 9 exceeding positive 7
## 10 like positive 7
## # ... with 211 more rows
# So “debt”, a negative word, is used 17 times in the speech; “free”, a positive word, is used 10 times. But hey! Wait a second.
## “Debt” is the most frequent word on the list, scored as negative. But in an economic report “debt” is probably descriptive rather than positive or negative.
## Also, “gross” is probably associated with “Gross Domestic Product” rather than expressions of disgust. Let’s investigate.
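## One quick way to eyeball context is to pull the raw lines that mention a word. A sketch using str_detect() from stringr:
FBUDGP_text_raw %>%
  filter(str_detect(text, regex("gross", ignore_case = TRUE))) %>%  # lines mentioning "gross"
  pull(text)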
# Exploring with bigrams
# We can apply tidytext principles to single words, like above. But we can also apply them to consecutive sequences of words, called n-grams. Two words together are called bigrams.
FBUDGP_bigrams <-
FBUDGP_text_raw %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
as_tibble()
FBUDGP_bigrams## # A tibble: 10,144 x 3
## Budget line bigram
## <chr> <int> <chr>
## 1 2019 1 budget speech
## 2 2019 1 speech 2019
## 3 2019 1 2019 20
## 4 2019 2 part i
## 5 2019 3 bismillahirrehmanir raheem
## 6 2019 4 mr speaker
## 7 2019 5 1 i
## 8 2019 5 i would
## 9 2019 5 would like
## 10 2019 5 like to
## # ... with 10,134 more rows
FBUDGP_bigrams %>%
  count(bigram, sort = TRUE)## # A tibble: 6,858 x 2
## bigram n
## <chr> <int>
## 1 <NA> 129
## 2 it is 59
## 3 is proposed 54
## 4 proposed to 50
## 5 to be 48
## 6 of the 44
## 7 in the 43
## 8 sales tax 34
## 9 the tax 32
## 10 proposed that 30
## # ... with 6,848 more rows
## As Silge and Robinson point out, many of these bigrams are uninteresting. Let’s filter out the ones that contain stop words.
bigrams_separated <- FBUDGP_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts## # A tibble: 1,772 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 <NA> <NA> 129
## 2 sales tax 34
## 3 2019 20 17
## 4 tax regime 14
## 5 tax credit 13
## 6 tax rates 11
## 7 income tax 9
## 8 tax rate 9
## 9 withholding tax 9
## 10 raw materials 8
## # ... with 1,762 more rows
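## As an aside, this is where igraph and ggraph earn their spot in the library list: following Silge and Robinson, the bigram counts can be drawn as a network. A minimal sketch (the n > 5 cutoff is an arbitrary choice of mine):
bigram_graph <- bigram_counts %>%
  filter(!is.na(word1), n > 5) %>%  # drop the NA row and rare bigrams
  graph_from_data_frame()           # word1 -> word2 edges, weighted by n
set.seed(2019)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)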
## Now unite the filtered words back into bigrams.
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigrams_united## # A tibble: 2,412 x 3
## Budget line bigram
## <chr> <int> <chr>
## 1 2019 1 budget speech
## 2 2019 1 speech 2019
## 3 2019 1 2019 20
## 4 2019 3 bismillahirrehmanir raheem
## 5 2019 5 thanking almighty
## 6 2019 5 almighty allah
## 7 2019 6 annual budget
## 8 2019 8 minister imran
## 9 2019 8 imran khan.tehreek
## 10 2019 8 insaaf brings
## # ... with 2,402 more rows
# Now let’s find out whether the speech meant “gross” in the negative sense or as part of phrases like “gross domestic product” when it used the word.
bigrams_filtered %>%
filter(word1 == "gross") %>%
count( word2, sort = TRUE)## # A tibble: 2 x 2
## word2 n
## <chr> <int>
## 1 amount 2
## 2 federal 1
## Sure enough, “gross” only shows up in descriptive phrases like “gross amount”, so we’ll probably want to drop terms like it from the sentiment score.
## Revised sentiment
## I analyzed the speech word frequencies and came up with a list of matched words that probably don’t carry real sentiment in this context.
### There’s another lexicon, “loughran”, that’s more tuned to financial text.
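### If you want to try it, here’s a minimal sketch (an aside of mine, not a step the analysis below runs; note that newer tidytext versions fetch this lexicon through the textdata package):
FBUDGP_text2 %>%
  inner_join(get_sentiments("loughran")) %>%  # loughran tags words with financial sentiment classes
  count(sentiment, word, sort = TRUE)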
### Instead I took the bing list and added some custom words. We can bind a list of custom words to the stop_words dataset and use it to filter with anti_join(). Following Silge and Robinson, we can use the %/% operator to break the text up into 80-line sections (about 3 pages of text).
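### A quick sanity check of what %/% does to line numbers (integer division, so every 80 consecutive lines share an index):
c(0, 79, 80, 159, 160) %/% 80## [1] 0 0 1 1 2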
custom_stop_words2 <-
  bind_rows(tibble(word = c("debt",
                            "gross",
                            "crude",
                            "well",
                            "maturity",
                            "work",
                            "marginally",
                            "leverage"),
                   lexicon = "custom"),
            stop_words)
FBUDGP_sentiment <-
FBUDGP_text %>%
anti_join(custom_stop_words2) %>%
inner_join(get_sentiments("bing")) %>%
count(Budget, index = line %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)## Joining, by = "word"
## Joining, by = "word"
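## A minimal sketch of how these section scores could be plotted, following the book’s running example (the styling choices here are mine, not from the original analysis):
ggplot(FBUDGP_sentiment, aes(index, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +  # one bar per 80-line section
  labs(x = "section (80 lines of text each)", y = "net sentiment (bing)")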