library(pdftools)
library(readtext)
library(quanteda)
## Package version: 2.0.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
##
## meta, meta<-
##
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
##
## as.DocumentTermMatrix, stopwords
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(xtable)
library(DT)
library(webshot)
# Run the whole script relative to the "Work package 2" project folder.
# NOTE(review): this path is specific to one machine; adjust it before running
# the script elsewhere.
setwd("~/Google Drive File Stream/My Drive/R/Projects/Work package 2")
#rm(list=ls())
# Sub-folders of the project directory: one holding the .pdf files and one
# holding the .txt files.
pdf_directory <- file.path(getwd(), "PDF")
txt_directory <- file.path(getwd(), "Texts")
# Create two folders in the working directory: "PDF", which holds the .pdf files, and a new, empty folder called "Texts", in which the .txt files are stored. Move all the .pdf files from Moodle into the PDF folder; the two paths above tell R where these folders are.
# Read every file inside the Texts folder into a data frame. Document
# variables are parsed from the file names, which are split on "_" into
# num, type, company and year (e.g. 054_AnnualReport_HuskyEnergy_2019.txt).
#
# file.path() puts the separator between the folder and the wildcard.
# txt_directory has no trailing slash, so pasting "*" directly onto it gives
# the glob ".../Texts*", which matches the folder itself (and any sibling
# whose name starts with "Texts") rather than the files inside it.
text_df <- readtext(file.path(txt_directory, "*"),
                    encoding = "UTF-8",
                    docvarsfrom = "filenames",
                    docvarnames = c("num", "type", "company", "year"),
                    dvsep = "_")
# Turn the data frame into a quanteda corpus and print the per-document
# counts of types, tokens and sentences together with the document variables.
text_corpus <- corpus(text_df)
summary(text_corpus)
## Corpus consisting of 39 documents, showing 39 documents:
##
## Text Types Tokens Sentences num
## 054_AnnualReport_HuskyEnergy_2019.txt 6996 88257 2349 54
## 055_AnnualReport_HuskyEnergy_2018.txt 6756 88275 2368 55
## 056_AnnualReport_HuskyEnergy_2017.txt 6901 89446 2118 56
## 057_AnnualReport_HuskyEnergy_2016.txt 6927 84807 1949 57
## 058_AnnualReport_HuskyEnergy_2015.txt 6538 74582 1738 58
## 071_AnnualReport_HuskyEnergy_2013.txt 6731 75466 1868 71
## 072_AnnualReport_HuskyEnergy_2012.txt 6848 82056 2046 72
## 073_AnnualReport_HuskyEnergy_2010.txt 6790 66654 1895 73
## 073_AnnualReport_HuskyEnergy_2011.txt 7732 74912 1900 73
## 075_AnnualReport_CenovusEnergy_2020.txt 7023 89771 2237 75
## 076_AnnualReport_CenovusEnergy_2019.txt 7117 89565 2219 76
## 077_AnnualReport_CenovusEnergy_2018.txt 7240 91739 2317 77
## 078_AnnualReport_CenovusEnergy_2017.txt 6584 69431 1837 78
## 079_AnnualReport_CenovusEnergy_2016.txt 6807 66181 1728 79
## 080_AnnualReport_CenovusEnergy_2015.txt 7235 80427 1992 80
## 081_AnnualReport_CenovusEnergy_2014.txt 7233 81619 2070 81
## 082_AnnualReport_CenovusEnergy_2013.txt 7616 86863 1790 82
## 083_AnnualReport_CenovusEnergy_2012.txt 7730 97091 1723 83
## 084_AnnualReport_CenovusEnergy_2011.txt 9860 90597 1478 84
## 194_AnnualReport_IndustrialAlliance_2010.txt 8423 103862 2575 194
## 195_AnnualReport_IndustrialAlliance_2011.txt 8866 110223 2604 195
## 196_AnnualReport_IndustrialAlliance_2012.txt 8201 91466 2226 196
## 197_AnnualReport_IndustrialAlliance_2013.txt 8210 96588 2406 197
## 198_AnnualReport_IndustrialAlliance_2014.txt 7873 88822 2113 198
## 199_AnnualReport_IndustrialAlliance_2015.txt 8359 88368 1953 199
## 200_AnnualReport_IndustrialAlliance_2016.txt 8012 88846 1980 200
## 201_AnnualReport_IndustrialAlliance_2017.txt 7054 79247 1911 201
## 202_AnnualReport_IndustrialAlliance_2018.txt 7636 85051 1949 202
## 203_AnnualReport_IndustrialAlliance_2019.txt 8140 81083 1208 203
## 247_AnnualReport_Shell_2020.txt 16210 234186 5628 247
## 248_AnnualReport_Shell_2019.txt 15471 223496 5373 248
## 249_AnnualReport_Shell_2018.txt 15021 208112 4878 249
## 250_AnnualReport_Shell_2017.txt 14651 192587 4596 250
## 251_AnnualReport_Shell_2016.txt 13091 165700 4115 251
## 252_AnnualReport_Shell_2015.txt 12071 143206 3349 252
## 253_AnnualReport_Shell_2014.txt 11754 138184 3242 253
## 254_AnnualReport_Shell_2013.txt 11513 130806 3014 254
## 255_AnnualReport_Shell_2012.txt 11669 131995 3077 255
## 256_AnnualReport_Shell_2011.txt 11420 130451 3044 256
## type company year
## AnnualReport HuskyEnergy 2019
## AnnualReport HuskyEnergy 2018
## AnnualReport HuskyEnergy 2017
## AnnualReport HuskyEnergy 2016
## AnnualReport HuskyEnergy 2015
## AnnualReport HuskyEnergy 2013
## AnnualReport HuskyEnergy 2012
## AnnualReport HuskyEnergy 2010
## AnnualReport HuskyEnergy 2011
## AnnualReport CenovusEnergy 2020
## AnnualReport CenovusEnergy 2019
## AnnualReport CenovusEnergy 2018
## AnnualReport CenovusEnergy 2017
## AnnualReport CenovusEnergy 2016
## AnnualReport CenovusEnergy 2015
## AnnualReport CenovusEnergy 2014
## AnnualReport CenovusEnergy 2013
## AnnualReport CenovusEnergy 2012
## AnnualReport CenovusEnergy 2011
## AnnualReport IndustrialAlliance 2010
## AnnualReport IndustrialAlliance 2011
## AnnualReport IndustrialAlliance 2012
## AnnualReport IndustrialAlliance 2013
## AnnualReport IndustrialAlliance 2014
## AnnualReport IndustrialAlliance 2015
## AnnualReport IndustrialAlliance 2016
## AnnualReport IndustrialAlliance 2017
## AnnualReport IndustrialAlliance 2018
## AnnualReport IndustrialAlliance 2019
## AnnualReport Shell 2020
## AnnualReport Shell 2019
## AnnualReport Shell 2018
## AnnualReport Shell 2017
## AnnualReport Shell 2016
## AnnualReport Shell 2015
## AnnualReport Shell 2014
## AnnualReport Shell 2013
## AnnualReport Shell 2012
## AnnualReport Shell 2011
# Build the document-feature matrix for the whole corpus: English stopwords,
# punctuation, numbers and symbols are removed, hyphenated words are split,
# and the remaining terms are stemmed.
# Per the attach messages, tm is loaded after quanteda and masks stopwords(),
# so the unqualified call below uses tm's stopword list.
data_texts_dfm <- dfm(
  text_corpus,
  stem = TRUE,
  split_hyphens = TRUE,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove = stopwords("english")
)
# Lower-case every feature, acronyms included.
data_texts_dfm <- dfm_tolower(data_texts_dfm, keep_acronyms = FALSE)
# Print a preview of the matrix.
data_texts_dfm
## Document-feature matrix of: 39 documents, 20,022 features (84.6% sparse) and 4 docvars.
## features
## docs energi driven annual report corpor
## 054_AnnualReport_HuskyEnergy_2019.txt 335 7 56 100 52
## 055_AnnualReport_HuskyEnergy_2018.txt 314 5 64 97 43
## 056_AnnualReport_HuskyEnergy_2017.txt 65 8 57 86 39
## 057_AnnualReport_HuskyEnergy_2016.txt 73 5 53 86 40
## 058_AnnualReport_HuskyEnergy_2015.txt 57 1 36 90 41
## 071_AnnualReport_HuskyEnergy_2013.txt 118 1 104 150 42
## features
## docs profil huski integr compani base
## 054_AnnualReport_HuskyEnergy_2019.txt 1 459 52 841 151
## 055_AnnualReport_HuskyEnergy_2018.txt 1 434 44 861 142
## 056_AnnualReport_HuskyEnergy_2017.txt 6 212 30 925 142
## 057_AnnualReport_HuskyEnergy_2016.txt 4 262 24 788 149
## 058_AnnualReport_HuskyEnergy_2015.txt 2 196 23 664 135
## 071_AnnualReport_HuskyEnergy_2013.txt 2 331 29 647 143
## [ reached max_ndoc ... 33 more documents, reached max_nfeat ... 20,012 more features ]
# Keyword-in-context search over the full corpus for "risk" and "climate",
# keeping three tokens of context on each side of every match. Patterns are
# treated as globs and matched without regard to case.
kwic_text <- kwic(
  text_corpus,
  pattern = c("risk", "climate"),
  window = 3,
  valuetype = "glob",
  case_insensitive = TRUE
)
# Show the first ten matches.
head(kwic_text, n = 10)
##
## [054_AnnualReport_HuskyEnergy_2019.txt, 6211] Standards and 5.0 | Risk |
## [054_AnnualReport_HuskyEnergy_2019.txt, 6213] 5.0 Risk and | Risk |
## [054_AnnualReport_HuskyEnergy_2019.txt, 6223] 57 5.1 Enterprise | Risk |
## [054_AnnualReport_HuskyEnergy_2019.txt, 6232] 59 5.2 Significant | Risk |
## [054_AnnualReport_HuskyEnergy_2019.txt, 11383] the impacts of | climate |
## [054_AnnualReport_HuskyEnergy_2019.txt, 11391] safety, enterprise | risk |
## [054_AnnualReport_HuskyEnergy_2019.txt, 11600] a discussion on | Risk |
## [054_AnnualReport_HuskyEnergy_2019.txt, 11602] on Risk and | Risk |
## [054_AnnualReport_HuskyEnergy_2019.txt, 21255] Inc. 5.0 | Risk |
## [054_AnnualReport_HuskyEnergy_2019.txt, 21257] 5.0 Risk and | Risk |
##
## and Risk Management
## Management 43 Changes
## Management 43 9.0
## Factors 43 9.1
## change, health
## management, resource
## and Risk Management
## Management, see
## and Risk Management
## Management 5.1 Enterprise
# Open the keyword-in-context table in the data viewer. View() needs an
# interactive session, so it is skipped when the script is run in batch mode
# or knitted instead of stopping the run.
if (interactive()) {
  View(kwic_text)
}
#View(data_suncor_dfm)
# Keyword-in-context is useful when we want to see whether a word such as "risk" appears close to, or in combination with, "climate".
# The kwic() call above searches for two words ("risk" and "climate") and shows the three words on either side of each match. valuetype = "glob" means the patterns are interpreted as glob-style wildcards (for example, "risk*" would also match "risks"; the plain patterns used here match only the exact words), and case_insensitive = TRUE makes the search ignore upper and lower case.
# Split the corpus into two periods: reports from 2015 onwards (half 2) and
# reports before 2015 (half 1).
half2text_corpus <- corpus_subset(text_corpus, year >= 2015)
half1text_corpus <- corpus_subset(text_corpus, year < 2015)

# Locate "risk" and "climate" once per period. The absolute and the relative
# x-ray plots are built from the same matches, so the kwic results are reused
# instead of scanning each sub-corpus again for every plot.
kwic_risk_half1 <- kwic(half1text_corpus, pattern = "risk")
kwic_climate_half1 <- kwic(half1text_corpus, pattern = "climate")
kwic_risk_half2 <- kwic(half2text_corpus, pattern = "risk")
kwic_climate_half2 <- kwic(half2text_corpus, pattern = "climate")

# Lexical dispersion (x-ray) plots. Objects ending in "2010" cover the
# pre-2015 reports, objects ending in "2015" the reports from 2015 onwards;
# each period is plotted on an absolute and on a relative token scale.
kwic_absoluterisk2010 <- textplot_xray(kwic_risk_half1, kwic_climate_half1,
                                       scale = "absolute")
kwic_absoluterisk2015 <- textplot_xray(kwic_risk_half2, kwic_climate_half2,
                                       scale = "absolute")
kwic_relativerisk2010 <- textplot_xray(kwic_risk_half1, kwic_climate_half1,
                                       scale = "relative")
kwic_relativerisk2015 <- textplot_xray(kwic_risk_half2, kwic_climate_half2,
                                       scale = "relative")

# Colour the matches by keyword (red/green) and hide the legend. Returns the
# modified ggplot object, which is printed when called at top level.
colour_xray_by_keyword <- function(xray_plot) {
  xray_plot +
    aes(color = keyword) +
    scale_color_manual(values = c("red", "green")) +
    theme(legend.position = "none")
}

colour_xray_by_keyword(kwic_absoluterisk2010)
colour_xray_by_keyword(kwic_absoluterisk2015)
colour_xray_by_keyword(kwic_relativerisk2010)
colour_xray_by_keyword(kwic_relativerisk2015)
# The 50 most frequent features of the stemmed document-feature matrix.
features <- topfeatures(data_texts_dfm, n = 50)
# One row per term with its total frequency.
features_plot <- data.frame(
  term = names(features),
  frequency = unname(features)
)
# Order the terms by decreasing frequency. This step is optional: without it
# the x-axis of the plot is sorted alphabetically by term.
features_plot$term <- reorder(features_plot$term, -features_plot$frequency)
# Dot plot of term frequencies with the term labels rotated by 90 degrees.
ggplot(features_plot) +
  geom_point(aes(x = term, y = frequency)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Keep only the features with a total frequency of at least 40 and draw a
# word cloud from the trimmed matrix.
wordcloud_dfm_trim <- dfm_trim(x = data_texts_dfm, min_termfreq = 40)
textplot_wordcloud(x = wordcloud_dfm_trim)
# Show the first six rows of the document variables (num, type, company,
# year).
head(docvars(text_corpus), n = 6L)
## num type company year
## 1 54 AnnualReport HuskyEnergy 2019
## 2 55 AnnualReport HuskyEnergy 2018
## 3 56 AnnualReport HuskyEnergy 2017
## 4 57 AnnualReport HuskyEnergy 2016
## 5 58 AnnualReport HuskyEnergy 2015
## 6 71 AnnualReport HuskyEnergy 2013
# Comparison word cloud across companies: build a document-feature matrix
# with the documents grouped by the "company" document variable, without
# English stopwords and punctuation.
wordcloud_dfm_comp <- dfm(text_corpus, groups = "company",
                          remove = stopwords("english"), remove_punct = TRUE)
# One colour per company group. The previous call passed `col`, which is not
# defined anywhere in the visible script and therefore resolved to the base
# function col() instead of a colour palette.
company_colors <- grDevices::rainbow(ndoc(wordcloud_dfm_comp), v = 0.7)
# Only features with a total frequency of at least 2000 are plotted.
# max_words is an argument of textplot_wordcloud(), not of dfm_trim(), so it
# is passed to the plotting call; labelcolor expects a colour, not a logical.
textplot_wordcloud(dfm_trim(wordcloud_dfm_comp, min_termfreq = 2000),
                   max_words = 5000,
                   comparison = TRUE,
                   random_color = FALSE,
                   color = company_colors,
                   labelcolor = "black")
# Dictionary approach: four keys, each defined by one or more patterns; a
# trailing "*" is a wildcard for any word ending.
dic_list <- list(
  risk = c("uncertain*", "risk*", "concern*"),
  climate = c("climat*", "environment*"),
  government = c("tax*", "regulat*", "legislat*"),
  country = "canada"
)
# Build the quanteda dictionary, leaving the patterns' case as written.
dic_created <- dictionary(dic_list, tolower = FALSE)
# Print the dictionary to check its keys and entries.
dic_created
## Dictionary object with 4 key entries.
## - [risk]:
## - uncertain*, risk*, concern*
## - [climate]:
## - climat*, environment*
## - [government]:
## - tax*, regulat*, legislat*
## - [country]:
## - canada
# Count dictionary matches per document. The lookup runs on an unstemmed
# document-feature matrix: the patterns in dic_created are written for full
# word forms, and several of them (e.g. "regulat*", "legislat*") cannot match
# the stemmed features of data_texts_dfm, where such words are reduced to
# stems like "regul" and "legisl". Tokenisation otherwise mirrors
# data_texts_dfm (no punctuation, numbers or symbols; hyphens split).
dictionary_dfm <- dfm(text_corpus,
                      remove_punct = TRUE,
                      remove_numbers = TRUE,
                      remove_symbols = TRUE,
                      split_hyphens = TRUE)
dictionary_results <- dfm_lookup(dictionary_dfm, dic_created)
# Print the per-document counts for each dictionary key.
dictionary_results
## Document-feature matrix of: 39 documents, 4 features (0.0% sparse) and 4 docvars.
## features
## docs risk climate government country
## 054_AnnualReport_HuskyEnergy_2019.txt 176 53 236 120
## 055_AnnualReport_HuskyEnergy_2018.txt 190 53 250 135
## 056_AnnualReport_HuskyEnergy_2017.txt 180 40 251 171
## 057_AnnualReport_HuskyEnergy_2016.txt 177 41 241 157
## 058_AnnualReport_HuskyEnergy_2015.txt 167 37 197 129
## 071_AnnualReport_HuskyEnergy_2013.txt 176 31 185 118
## [ reached max_ndoc ... 33 more documents ]
# library() stops with an error when Matrix is not installed; require() only
# returns FALSE and lets the script carry on without the package.
library(Matrix)
## Loading required package: Matrix
# Hierarchical clustering of the documents on their relative term
# frequencies: weight the trimmed matrix by within-document proportion,
# compute pairwise Euclidean distances, and cluster with complete linkage
# (the defaults of dist() and hclust(), spelled out here).
textdistmat <- dist(
  as.matrix(dfm_weight(wordcloud_dfm_trim, scheme = "prop")),
  method = "euclidean"
)
presCluster <- hclust(textdistmat, method = "complete")
# Use the document names as leaf labels.
presCluster$labels <- docnames(wordcloud_dfm_trim)
# Draw the dendrogram with all leaves aligned at the bottom and small labels.
plot(presCluster,
     main = "Euclidean Distance on Normalized Token Frequency",
     xlab = "", sub = "",
     hang = -1, cex = 0.4)
library(quanteda.textmodels)
#wordfishdfm@Dimnames$docs
# Keep only the features (terms) whose total frequency is at least 1000;
# trimming filters features, not documents.
wordfishdfm <- dfm_trim(data_texts_dfm, min_termfreq = 1000)
# Fit a Poisson Wordfish scaling model. dir fixes the direction of the scale
# by requiring document 9 to be placed below document 30; in the corpus
# summary order these are 073_AnnualReport_HuskyEnergy_2011.txt and
# 247_AnnualReport_Shell_2020.txt.
wordfish_texts <- textmodel_wordfish(
  wordfishdfm,
  dispersion = "poisson",
  dir = c(9, 30)
)
# Print the estimated document positions and feature scores.
summary(wordfish_texts)
##
## Call:
## textmodel_wordfish.dfm(x = wordfishdfm, dir = c(9, 30), dispersion = "poisson")
##
## Estimated Document Positions:
## theta se
## 054_AnnualReport_HuskyEnergy_2019.txt 0.3117 0.0013157
## 055_AnnualReport_HuskyEnergy_2018.txt 0.3074 0.0013646
## 056_AnnualReport_HuskyEnergy_2017.txt 0.2991 0.0014858
## 057_AnnualReport_HuskyEnergy_2016.txt 0.3068 0.0014214
## 058_AnnualReport_HuskyEnergy_2015.txt 0.3011 0.0016023
## 071_AnnualReport_HuskyEnergy_2013.txt 0.2911 0.0017500
## 072_AnnualReport_HuskyEnergy_2012.txt 0.2934 0.0016520
## 073_AnnualReport_HuskyEnergy_2010.txt 0.3276 0.0013425
## 073_AnnualReport_HuskyEnergy_2011.txt 0.2234 0.0031551
## 075_AnnualReport_CenovusEnergy_2020.txt 0.2942 0.0015541
## 076_AnnualReport_CenovusEnergy_2019.txt 0.2898 0.0016174
## 077_AnnualReport_CenovusEnergy_2018.txt 0.2933 0.0015611
## 078_AnnualReport_CenovusEnergy_2017.txt 0.2985 0.0017058
## 079_AnnualReport_CenovusEnergy_2016.txt 0.2882 0.0019289
## 080_AnnualReport_CenovusEnergy_2015.txt 0.3230 0.0012228
## 081_AnnualReport_CenovusEnergy_2014.txt 0.3084 0.0014357
## 082_AnnualReport_CenovusEnergy_2013.txt 0.3277 0.0011420
## 083_AnnualReport_CenovusEnergy_2012.txt 0.3206 0.0011254
## 084_AnnualReport_CenovusEnergy_2011.txt 0.3266 0.0012397
## 194_AnnualReport_IndustrialAlliance_2010.txt -0.5458 0.0099025
## 195_AnnualReport_IndustrialAlliance_2011.txt -0.5928 0.0093776
## 196_AnnualReport_IndustrialAlliance_2012.txt -0.6023 0.0102511
## 197_AnnualReport_IndustrialAlliance_2013.txt -0.5770 0.0099597
## 198_AnnualReport_IndustrialAlliance_2014.txt -0.6001 0.0103286
## 199_AnnualReport_IndustrialAlliance_2015.txt -0.6019 0.0104904
## 200_AnnualReport_IndustrialAlliance_2016.txt -0.5985 0.0103517
## 201_AnnualReport_IndustrialAlliance_2017.txt -0.5783 0.0108405
## 202_AnnualReport_IndustrialAlliance_2018.txt -0.5485 0.0107309
## 203_AnnualReport_IndustrialAlliance_2019.txt -5.5363 0.0027482
## 247_AnnualReport_Shell_2020.txt 0.5068 0.0001504
## 248_AnnualReport_Shell_2019.txt 0.5080 0.0001521
## 249_AnnualReport_Shell_2018.txt 0.5115 0.0001528
## 250_AnnualReport_Shell_2017.txt 0.5128 0.0001580
## 251_AnnualReport_Shell_2016.txt 0.5158 0.0001625
## 252_AnnualReport_Shell_2015.txt 0.4998 0.0001932
## 253_AnnualReport_Shell_2014.txt 0.4984 0.0001985
## 254_AnnualReport_Shell_2013.txt 0.4991 0.0002020
## 255_AnnualReport_Shell_2012.txt 0.4983 0.0002010
## 256_AnnualReport_Shell_2011.txt 0.4990 0.0002013
##
## Estimated Feature Scores:
## energi annual report corpor huski integr compani base alberta
## beta 2.012 0.05727 0.276 -0.02587 0.03564 1.091 -0.09401 0.3395 1.037
## psi 4.216 5.33708 5.740 4.73908 4.40965 3.552 6.47200 5.0018 3.402
## common share public trade stock exchang oper canada unit
## beta -0.2241 0.2056 0.06821 0.9426 -0.1984 0.302 0.3599 -0.001078 0.01828
## psi 4.4872 6.0490 3.30976 3.9669 4.3444 4.609 5.8816 4.534646 4.29780
## state asia region upstream downstream busi segment two main
## beta 0.2258 4.907 0.07797 3.485 3.275 -0.07162 0.307 0.1045 -0.1091
## psi 3.6020 2.387 3.41652 3.368 2.929 5.16800 3.994 3.6984 3.6436
## area focus includ
## beta 1.690 0.1312 0.2068
## psi 3.221 3.4076 5.6247
# Estimated document positions on the Wordfish scale.
textplot_scale1d(wordfish_texts)
# Same positions, grouped by company. The grouping vector is taken from the
# dfm the model was fitted on, so it is guaranteed to line up with the
# model's documents (it was previously read from an unrelated trimmed dfm).
textplot_scale1d(wordfish_texts, groups = docvars(wordfishdfm, "company"))
# Feature scores, highlighting the stems "climat" and "risk".
textplot_scale1d(wordfish_texts, margin = "features",
                 highlighted = c("climat", "risk"))
# Document positions again, with the margin stated explicitly.
# NOTE(review): this appears to repeat the first plot — confirm whether both
# are needed.
textplot_scale1d(wordfish_texts, margin = "documents")
# Point estimates of the document positions with lower and upper confidence
# bounds.
predict(wordfish_texts, interval = "confidence")
## $fit
## fit lwr upr
## 054_AnnualReport_HuskyEnergy_2019.txt 0.3116844 0.3091058 0.3142631
## 055_AnnualReport_HuskyEnergy_2018.txt 0.3073945 0.3047200 0.3100690
## 056_AnnualReport_HuskyEnergy_2017.txt 0.2991271 0.2962149 0.3020392
## 057_AnnualReport_HuskyEnergy_2016.txt 0.3068291 0.3040431 0.3096150
## 058_AnnualReport_HuskyEnergy_2015.txt 0.3010559 0.2979155 0.3041963
## 071_AnnualReport_HuskyEnergy_2013.txt 0.2910691 0.2876392 0.2944990
## 072_AnnualReport_HuskyEnergy_2012.txt 0.2934328 0.2901948 0.2966707
## 073_AnnualReport_HuskyEnergy_2010.txt 0.3276491 0.3250178 0.3302804
## 073_AnnualReport_HuskyEnergy_2011.txt 0.2234487 0.2172648 0.2296325
## 075_AnnualReport_CenovusEnergy_2020.txt 0.2942164 0.2911704 0.2972624
## 076_AnnualReport_CenovusEnergy_2019.txt 0.2897672 0.2865971 0.2929372
## 077_AnnualReport_CenovusEnergy_2018.txt 0.2933042 0.2902445 0.2963640
## 078_AnnualReport_CenovusEnergy_2017.txt 0.2984973 0.2951541 0.3018406
## 079_AnnualReport_CenovusEnergy_2016.txt 0.2882101 0.2844296 0.2919906
## 080_AnnualReport_CenovusEnergy_2015.txt 0.3230175 0.3206210 0.3254140
## 081_AnnualReport_CenovusEnergy_2014.txt 0.3083649 0.3055509 0.3111789
## 082_AnnualReport_CenovusEnergy_2013.txt 0.3277134 0.3254751 0.3299517
## 083_AnnualReport_CenovusEnergy_2012.txt 0.3205909 0.3183852 0.3227965
## 084_AnnualReport_CenovusEnergy_2011.txt 0.3266069 0.3241771 0.3290367
## 194_AnnualReport_IndustrialAlliance_2010.txt -0.5458133 -0.5652217 -0.5264048
## 195_AnnualReport_IndustrialAlliance_2011.txt -0.5928042 -0.6111840 -0.5744245
## 196_AnnualReport_IndustrialAlliance_2012.txt -0.6022579 -0.6223498 -0.5821661
## 197_AnnualReport_IndustrialAlliance_2013.txt -0.5769753 -0.5964960 -0.5574546
## 198_AnnualReport_IndustrialAlliance_2014.txt -0.6001356 -0.6203793 -0.5798919
## 199_AnnualReport_IndustrialAlliance_2015.txt -0.6018909 -0.6224517 -0.5813301
## 200_AnnualReport_IndustrialAlliance_2016.txt -0.5984882 -0.6187772 -0.5781991
## 201_AnnualReport_IndustrialAlliance_2017.txt -0.5782648 -0.5995118 -0.5570178
## 202_AnnualReport_IndustrialAlliance_2018.txt -0.5485127 -0.5695449 -0.5274806
## 203_AnnualReport_IndustrialAlliance_2019.txt -5.5362530 -5.5416393 -5.5308668
## 247_AnnualReport_Shell_2020.txt 0.5067629 0.5064681 0.5070577
## 248_AnnualReport_Shell_2019.txt 0.5080365 0.5077383 0.5083347
## 249_AnnualReport_Shell_2018.txt 0.5115178 0.5112182 0.5118173
## 250_AnnualReport_Shell_2017.txt 0.5127528 0.5124431 0.5130626
## 251_AnnualReport_Shell_2016.txt 0.5158130 0.5154944 0.5161315
## 252_AnnualReport_Shell_2015.txt 0.4997502 0.4993714 0.5001289
## 253_AnnualReport_Shell_2014.txt 0.4984099 0.4980208 0.4987990
## 254_AnnualReport_Shell_2013.txt 0.4990824 0.4986864 0.4994783
## 255_AnnualReport_Shell_2012.txt 0.4983240 0.4979301 0.4987180
## 256_AnnualReport_Shell_2011.txt 0.4989671 0.4985725 0.4993617