library(pdftools)
library(readtext)
library(quanteda)
## Package version: 2.0.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
## 
##     meta, meta<-
## 
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
## 
##     as.DocumentTermMatrix, stopwords
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(xtable)
library(DT)
library(webshot)
# Project root. setwd() is kept because the rest of the script relies on the
# working directory.
# NOTE(review): a hard-coded setwd() makes the script machine-specific; consider
# an RStudio project or a path variable instead.
setwd("~/Google Drive File Stream/My Drive/R/Projects/Work package 2")
#rm(list=ls())
# Build the folder paths with file.path() rather than pasting separators by hand.
pdf_directory <- file.path(getwd(), "PDF")
txt_directory <- file.path(getwd(), "Texts")

# Make two folders: one called PDF, which stores the .pdf files, and a new, empty one called Texts, which will store the .txt files. Move all the .pdf files from Moodle into the PDF folder; the two directory variables above tell R where these folders are.
# Read every file inside the Texts folder. The document variables are parsed
# from the file names, which are split on "_" into num, type, company and year.
# file.path() puts a separator between the folder and the wildcard; without it
# the glob is "<...>/Texts*", which matches any path that merely starts with
# "Texts" (the folder itself included) instead of the files inside the folder.
text_df <- readtext(file.path(txt_directory, "*"),
                    encoding = "UTF-8",
                    docvarsfrom = "filenames",
                    docvarnames = c("num", "type", "company", "year"),
                    dvsep = "_")
# Turn the data frame of texts into a quanteda corpus and print its summary.
text_corpus <- corpus(text_df)
summary(text_corpus)
## Corpus consisting of 39 documents, showing 39 documents:
## 
##                                          Text Types Tokens Sentences num
##         054_AnnualReport_HuskyEnergy_2019.txt  6996  88257      2349  54
##         055_AnnualReport_HuskyEnergy_2018.txt  6756  88275      2368  55
##         056_AnnualReport_HuskyEnergy_2017.txt  6901  89446      2118  56
##         057_AnnualReport_HuskyEnergy_2016.txt  6927  84807      1949  57
##         058_AnnualReport_HuskyEnergy_2015.txt  6538  74582      1738  58
##         071_AnnualReport_HuskyEnergy_2013.txt  6731  75466      1868  71
##         072_AnnualReport_HuskyEnergy_2012.txt  6848  82056      2046  72
##         073_AnnualReport_HuskyEnergy_2010.txt  6790  66654      1895  73
##         073_AnnualReport_HuskyEnergy_2011.txt  7732  74912      1900  73
##       075_AnnualReport_CenovusEnergy_2020.txt  7023  89771      2237  75
##       076_AnnualReport_CenovusEnergy_2019.txt  7117  89565      2219  76
##       077_AnnualReport_CenovusEnergy_2018.txt  7240  91739      2317  77
##       078_AnnualReport_CenovusEnergy_2017.txt  6584  69431      1837  78
##       079_AnnualReport_CenovusEnergy_2016.txt  6807  66181      1728  79
##       080_AnnualReport_CenovusEnergy_2015.txt  7235  80427      1992  80
##       081_AnnualReport_CenovusEnergy_2014.txt  7233  81619      2070  81
##       082_AnnualReport_CenovusEnergy_2013.txt  7616  86863      1790  82
##       083_AnnualReport_CenovusEnergy_2012.txt  7730  97091      1723  83
##       084_AnnualReport_CenovusEnergy_2011.txt  9860  90597      1478  84
##  194_AnnualReport_IndustrialAlliance_2010.txt  8423 103862      2575 194
##  195_AnnualReport_IndustrialAlliance_2011.txt  8866 110223      2604 195
##  196_AnnualReport_IndustrialAlliance_2012.txt  8201  91466      2226 196
##  197_AnnualReport_IndustrialAlliance_2013.txt  8210  96588      2406 197
##  198_AnnualReport_IndustrialAlliance_2014.txt  7873  88822      2113 198
##  199_AnnualReport_IndustrialAlliance_2015.txt  8359  88368      1953 199
##  200_AnnualReport_IndustrialAlliance_2016.txt  8012  88846      1980 200
##  201_AnnualReport_IndustrialAlliance_2017.txt  7054  79247      1911 201
##  202_AnnualReport_IndustrialAlliance_2018.txt  7636  85051      1949 202
##  203_AnnualReport_IndustrialAlliance_2019.txt  8140  81083      1208 203
##               247_AnnualReport_Shell_2020.txt 16210 234186      5628 247
##               248_AnnualReport_Shell_2019.txt 15471 223496      5373 248
##               249_AnnualReport_Shell_2018.txt 15021 208112      4878 249
##               250_AnnualReport_Shell_2017.txt 14651 192587      4596 250
##               251_AnnualReport_Shell_2016.txt 13091 165700      4115 251
##               252_AnnualReport_Shell_2015.txt 12071 143206      3349 252
##               253_AnnualReport_Shell_2014.txt 11754 138184      3242 253
##               254_AnnualReport_Shell_2013.txt 11513 130806      3014 254
##               255_AnnualReport_Shell_2012.txt 11669 131995      3077 255
##               256_AnnualReport_Shell_2011.txt 11420 130451      3044 256
##          type            company year
##  AnnualReport        HuskyEnergy 2019
##  AnnualReport        HuskyEnergy 2018
##  AnnualReport        HuskyEnergy 2017
##  AnnualReport        HuskyEnergy 2016
##  AnnualReport        HuskyEnergy 2015
##  AnnualReport        HuskyEnergy 2013
##  AnnualReport        HuskyEnergy 2012
##  AnnualReport        HuskyEnergy 2010
##  AnnualReport        HuskyEnergy 2011
##  AnnualReport      CenovusEnergy 2020
##  AnnualReport      CenovusEnergy 2019
##  AnnualReport      CenovusEnergy 2018
##  AnnualReport      CenovusEnergy 2017
##  AnnualReport      CenovusEnergy 2016
##  AnnualReport      CenovusEnergy 2015
##  AnnualReport      CenovusEnergy 2014
##  AnnualReport      CenovusEnergy 2013
##  AnnualReport      CenovusEnergy 2012
##  AnnualReport      CenovusEnergy 2011
##  AnnualReport IndustrialAlliance 2010
##  AnnualReport IndustrialAlliance 2011
##  AnnualReport IndustrialAlliance 2012
##  AnnualReport IndustrialAlliance 2013
##  AnnualReport IndustrialAlliance 2014
##  AnnualReport IndustrialAlliance 2015
##  AnnualReport IndustrialAlliance 2016
##  AnnualReport IndustrialAlliance 2017
##  AnnualReport IndustrialAlliance 2018
##  AnnualReport IndustrialAlliance 2019
##  AnnualReport              Shell 2020
##  AnnualReport              Shell 2019
##  AnnualReport              Shell 2018
##  AnnualReport              Shell 2017
##  AnnualReport              Shell 2016
##  AnnualReport              Shell 2015
##  AnnualReport              Shell 2014
##  AnnualReport              Shell 2013
##  AnnualReport              Shell 2012
##  AnnualReport              Shell 2011
# Build the document-feature matrix: drop English stopwords, punctuation,
# numbers and symbols, split hyphenated words, and stem the remaining tokens.
# Note: stopwords() resolves to tm's version here, because tm was attached after
# quanteda and masks quanteda's stopwords().
data_texts_dfm <- dfm(text_corpus,
                      remove = stopwords("english"),
                      remove_punct = TRUE,
                      remove_numbers = TRUE,
                      remove_symbols = TRUE,
                      split_hyphens = TRUE,
                      stem = TRUE)  # TRUE rather than T: T can be reassigned
# Lower-case all features, acronyms included.
data_texts_dfm <- dfm_tolower(data_texts_dfm, keep_acronyms = FALSE)
data_texts_dfm
## Document-feature matrix of: 39 documents, 20,022 features (84.6% sparse) and 4 docvars.
##                                        features
## docs                                    energi driven annual report corpor
##   054_AnnualReport_HuskyEnergy_2019.txt    335      7     56    100     52
##   055_AnnualReport_HuskyEnergy_2018.txt    314      5     64     97     43
##   056_AnnualReport_HuskyEnergy_2017.txt     65      8     57     86     39
##   057_AnnualReport_HuskyEnergy_2016.txt     73      5     53     86     40
##   058_AnnualReport_HuskyEnergy_2015.txt     57      1     36     90     41
##   071_AnnualReport_HuskyEnergy_2013.txt    118      1    104    150     42
##                                        features
## docs                                    profil huski integr compani base
##   054_AnnualReport_HuskyEnergy_2019.txt      1   459     52     841  151
##   055_AnnualReport_HuskyEnergy_2018.txt      1   434     44     861  142
##   056_AnnualReport_HuskyEnergy_2017.txt      6   212     30     925  142
##   057_AnnualReport_HuskyEnergy_2016.txt      4   262     24     788  149
##   058_AnnualReport_HuskyEnergy_2015.txt      2   196     23     664  135
##   071_AnnualReport_HuskyEnergy_2013.txt      2   331     29     647  143
## [ reached max_ndoc ... 33 more documents, reached max_nfeat ... 20,012 more features ]
# Keyword-in-context search for "risk" and "climate", showing three tokens on
# either side of each match. Matching is glob-style and ignores case.
kwic_text <- kwic(text_corpus,
                  pattern = c("risk", "climate"),
                  window = 3,
                  valuetype = "glob",
                  case_insensitive = TRUE)  # TRUE rather than T: T can be reassigned
# Show the first ten matches.
head(kwic_text, n = 10)
##                                                                               
##   [054_AnnualReport_HuskyEnergy_2019.txt, 6211]  Standards and 5.0 |  Risk   |
##   [054_AnnualReport_HuskyEnergy_2019.txt, 6213]       5.0 Risk and |  Risk   |
##   [054_AnnualReport_HuskyEnergy_2019.txt, 6223]  57 5.1 Enterprise |  Risk   |
##   [054_AnnualReport_HuskyEnergy_2019.txt, 6232] 59 5.2 Significant |  Risk   |
##  [054_AnnualReport_HuskyEnergy_2019.txt, 11383]     the impacts of | climate |
##  [054_AnnualReport_HuskyEnergy_2019.txt, 11391] safety, enterprise |  risk   |
##  [054_AnnualReport_HuskyEnergy_2019.txt, 11600]    a discussion on |  Risk   |
##  [054_AnnualReport_HuskyEnergy_2019.txt, 11602]        on Risk and |  Risk   |
##  [054_AnnualReport_HuskyEnergy_2019.txt, 21255]           Inc. 5.0 |  Risk   |
##  [054_AnnualReport_HuskyEnergy_2019.txt, 21257]       5.0 Risk and |  Risk   |
##                           
##  and Risk Management      
##  Management 43 Changes    
##  Management 43 9.0        
##  Factors 43 9.1           
##  change, health           
##  management, resource     
##  and Risk Management      
##  Management, see          
##  and Risk Management      
##  Management 5.1 Enterprise
# Display the full keyword-in-context table. View() was masked by quanteda when
# the package was attached, so this is not necessarily utils::View().
# NOTE(review): presumably this relies on quanteda's View method for kwic
# objects -- confirm before changing or guarding this call.
View(kwic_text)
#View(data_suncor_dfm)
# This is practical if we want to know whether a word like "risk" occurs in combination with "climate".
# I run the keyword-in-context command in R. I specify two words ("risk" and "climate") and tell R to show the three words on either side of each match. valuetype = "glob" enables glob-style wildcard matching (e.g. "risk*" would also match "risks"); because the patterns here contain no wildcards, only the exact words are matched. Lastly, case_insensitive = TRUE makes the search ignore upper and lower case.
# Split the corpus into two periods using the `year` document variable:
# reports before 2015, and reports from 2015 onwards.
half2text_corpus <- corpus_subset(text_corpus, year >= 2015)
half1text_corpus <- corpus_subset(text_corpus, year < 2015)

# Run each keyword search once per period and reuse the result for both the
# absolute and the relative plot, instead of repeating the same kwic() search.
kwic_risk_half1 <- kwic(half1text_corpus, pattern = "risk")
kwic_climate_half1 <- kwic(half1text_corpus, pattern = "climate")
kwic_risk_half2 <- kwic(half2text_corpus, pattern = "risk")
kwic_climate_half2 <- kwic(half2text_corpus, pattern = "climate")

# Lexical dispersion (x-ray) plots of both keywords, on absolute token positions ...
kwic_absoluterisk2010 <- textplot_xray(
  kwic_risk_half1,
  kwic_climate_half1, scale = "absolute")
kwic_absoluterisk2015 <- textplot_xray(
  kwic_risk_half2,
  kwic_climate_half2, scale = "absolute")
# ... and on positions relative to document length.
kwic_relativerisk2010 <- textplot_xray(
  kwic_risk_half1,
  kwic_climate_half1, scale = "relative")
kwic_relativerisk2015 <- textplot_xray(
  kwic_risk_half2,
  kwic_climate_half2, scale = "relative")


# Layers shared by all four x-ray plots: colour the marks by keyword using a
# red/green manual scale, and hide the legend.
xray_layers <- list(
  aes(color = keyword),
  scale_color_manual(values = c("red", "green")),
  theme(legend.position = "none")
)

# Absolute scale, first and second period.
kwic_absoluterisk2010 + xray_layers

kwic_absoluterisk2015 + xray_layers

# Relative scale, first and second period.
kwic_relativerisk2010 + xray_layers

kwic_relativerisk2015 + xray_layers

# The 50 most frequent features in the dfm, as a named vector of counts.
features <- topfeatures(data_texts_dfm, n = 50)
features_plot <- data.frame(term = names(features),
                            frequency = unname(features))
# Order the terms by decreasing frequency. This is optional: without it the
# x-axis would be sorted alphabetically instead.
features_plot$term <- reorder(features_plot$term, -features_plot$frequency)

# Dot plot of term frequency, with the term labels rotated to stay readable.
ggplot(features_plot, aes(x = term, y = frequency)) +
    geom_point() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Word cloud of the features that occur at least 40 times overall.
wordcloud_dfm_trim <- dfm_trim(x = data_texts_dfm, min_termfreq = 40)
textplot_wordcloud(x = wordcloud_dfm_trim)

# First rows of the document variables attached to the corpus.
head(x = docvars(text_corpus))
##   num         type     company year
## 1  54 AnnualReport HuskyEnergy 2019
## 2  55 AnnualReport HuskyEnergy 2018
## 3  56 AnnualReport HuskyEnergy 2017
## 4  57 AnnualReport HuskyEnergy 2016
## 5  58 AnnualReport HuskyEnergy 2015
## 6  71 AnnualReport HuskyEnergy 2013
# Comparison word cloud by company.
# Build a dfm grouped by the `company` document variable, dropping English
# stopwords and punctuation.
wordcloud_dfm_comp <- dfm(text_corpus, groups = "company",
    remove = stopwords("english"), remove_punct = TRUE)
# Keep only features that occur at least 2000 times before plotting.
# max_words is an argument of textplot_wordcloud(), not of dfm_trim(), so it is
# passed to the plotting call, where it actually takes effect.
# NOTE(review): `col` is not defined in the visible part of this script; unless
# a colour vector named `col` is created elsewhere, this resolves to the base
# function col(). Confirm the intended palette (one colour per company).
textplot_wordcloud(dfm_trim(wordcloud_dfm_comp,
                            min_termfreq = 2000),
                   max_words = 5000,
                   comparison = TRUE,
                   random_color = FALSE,
                   color = col,
                   labelcolor = TRUE)

# Dictionary approach ----
# Four keys, each mapped to one or more glob patterns.
dic_list <- list(
  risk       = c("uncertain*", "risk*", "concern*"),
  climate    = c("climat*", "environment*"),
  government = c("tax*", "regulat*", "legislat*"),
  country    = "canada"
)
# Convert the list into a quanteda dictionary without lower-casing its entries,
# then print it.
dic_created <- dictionary(x = dic_list, tolower = FALSE)
dic_created
## Dictionary object with 4 key entries.
## - [risk]:
##   - uncertain*, risk*, concern*
## - [climate]:
##   - climat*, environment*
## - [government]:
##   - tax*, regulat*, legislat*
## - [country]:
##   - canada
# Apply the dictionary to the dfm: the result has one column per dictionary key.
dictionary_results <- dfm_lookup(x = data_texts_dfm, dictionary = dic_created)
dictionary_results
## Document-feature matrix of: 39 documents, 4 features (0.0% sparse) and 4 docvars.
##                                        features
## docs                                    risk climate government country
##   054_AnnualReport_HuskyEnergy_2019.txt  176      53        236     120
##   055_AnnualReport_HuskyEnergy_2018.txt  190      53        250     135
##   056_AnnualReport_HuskyEnergy_2017.txt  180      40        251     171
##   057_AnnualReport_HuskyEnergy_2016.txt  177      41        241     157
##   058_AnnualReport_HuskyEnergy_2015.txt  167      37        197     129
##   071_AnnualReport_HuskyEnergy_2013.txt  176      31        185     118
## [ reached max_ndoc ... 33 more documents ]
require(Matrix)
## Loading required package: Matrix
# Convert counts to within-document proportions, then compute the pairwise
# distances between documents.
prop_dfm <- dfm_weight(wordcloud_dfm_trim, scheme = "prop")
prop_matrix <- as.matrix(prop_dfm)
textdistmat <- dist(prop_matrix)

# Hierarchical clustering on the distance matrix.
presCluster <- hclust(textdistmat)
# Label the leaves with the document names.
presCluster$labels <- docnames(wordcloud_dfm_trim)
# Draw the dendrogram.
plot(presCluster,
     xlab = "",
     sub = "",
     main = "Euclidean Distance on Normalized Token Frequency",
     hang = -1,
     cex = 0.4)

library(quanteda.textmodels)
#wordfishdfm@Dimnames$docs
# Keep only the features (terms) whose total frequency is at least 1000; the
# set of documents is unchanged.
wordfishdfm <- dfm_trim(x = data_texts_dfm, min_termfreq = 1000)
# Fit a Poisson wordfish scaling model. `dir` names two documents by position
# (9 and 30) and fixes the direction of the scale.
wordfish_texts <- textmodel_wordfish(
  x = wordfishdfm,
  dir = c(9, 30),
  dispersion = "poisson"
)
summary(wordfish_texts)
## 
## Call:
## textmodel_wordfish.dfm(x = wordfishdfm, dir = c(9, 30), dispersion = "poisson")
## 
## Estimated Document Positions:
##                                                theta        se
## 054_AnnualReport_HuskyEnergy_2019.txt         0.3117 0.0013157
## 055_AnnualReport_HuskyEnergy_2018.txt         0.3074 0.0013646
## 056_AnnualReport_HuskyEnergy_2017.txt         0.2991 0.0014858
## 057_AnnualReport_HuskyEnergy_2016.txt         0.3068 0.0014214
## 058_AnnualReport_HuskyEnergy_2015.txt         0.3011 0.0016023
## 071_AnnualReport_HuskyEnergy_2013.txt         0.2911 0.0017500
## 072_AnnualReport_HuskyEnergy_2012.txt         0.2934 0.0016520
## 073_AnnualReport_HuskyEnergy_2010.txt         0.3276 0.0013425
## 073_AnnualReport_HuskyEnergy_2011.txt         0.2234 0.0031551
## 075_AnnualReport_CenovusEnergy_2020.txt       0.2942 0.0015541
## 076_AnnualReport_CenovusEnergy_2019.txt       0.2898 0.0016174
## 077_AnnualReport_CenovusEnergy_2018.txt       0.2933 0.0015611
## 078_AnnualReport_CenovusEnergy_2017.txt       0.2985 0.0017058
## 079_AnnualReport_CenovusEnergy_2016.txt       0.2882 0.0019289
## 080_AnnualReport_CenovusEnergy_2015.txt       0.3230 0.0012228
## 081_AnnualReport_CenovusEnergy_2014.txt       0.3084 0.0014357
## 082_AnnualReport_CenovusEnergy_2013.txt       0.3277 0.0011420
## 083_AnnualReport_CenovusEnergy_2012.txt       0.3206 0.0011254
## 084_AnnualReport_CenovusEnergy_2011.txt       0.3266 0.0012397
## 194_AnnualReport_IndustrialAlliance_2010.txt -0.5458 0.0099025
## 195_AnnualReport_IndustrialAlliance_2011.txt -0.5928 0.0093776
## 196_AnnualReport_IndustrialAlliance_2012.txt -0.6023 0.0102511
## 197_AnnualReport_IndustrialAlliance_2013.txt -0.5770 0.0099597
## 198_AnnualReport_IndustrialAlliance_2014.txt -0.6001 0.0103286
## 199_AnnualReport_IndustrialAlliance_2015.txt -0.6019 0.0104904
## 200_AnnualReport_IndustrialAlliance_2016.txt -0.5985 0.0103517
## 201_AnnualReport_IndustrialAlliance_2017.txt -0.5783 0.0108405
## 202_AnnualReport_IndustrialAlliance_2018.txt -0.5485 0.0107309
## 203_AnnualReport_IndustrialAlliance_2019.txt -5.5363 0.0027482
## 247_AnnualReport_Shell_2020.txt               0.5068 0.0001504
## 248_AnnualReport_Shell_2019.txt               0.5080 0.0001521
## 249_AnnualReport_Shell_2018.txt               0.5115 0.0001528
## 250_AnnualReport_Shell_2017.txt               0.5128 0.0001580
## 251_AnnualReport_Shell_2016.txt               0.5158 0.0001625
## 252_AnnualReport_Shell_2015.txt               0.4998 0.0001932
## 253_AnnualReport_Shell_2014.txt               0.4984 0.0001985
## 254_AnnualReport_Shell_2013.txt               0.4991 0.0002020
## 255_AnnualReport_Shell_2012.txt               0.4983 0.0002010
## 256_AnnualReport_Shell_2011.txt               0.4990 0.0002013
## 
## Estimated Feature Scores:
##      energi  annual report   corpor   huski integr  compani   base alberta
## beta  2.012 0.05727  0.276 -0.02587 0.03564  1.091 -0.09401 0.3395   1.037
## psi   4.216 5.33708  5.740  4.73908 4.40965  3.552  6.47200 5.0018   3.402
##       common  share  public  trade   stock exchang   oper    canada    unit
## beta -0.2241 0.2056 0.06821 0.9426 -0.1984   0.302 0.3599 -0.001078 0.01828
## psi   4.4872 6.0490 3.30976 3.9669  4.3444   4.609 5.8816  4.534646 4.29780
##       state  asia  region upstream downstream     busi segment    two    main
## beta 0.2258 4.907 0.07797    3.485      3.275 -0.07162   0.307 0.1045 -0.1091
## psi  3.6020 2.387 3.41652    3.368      2.929  5.16800   3.994 3.6984  3.6436
##       area  focus includ
## beta 1.690 0.1312 0.2068
## psi  3.221 3.4076 5.6247
# Estimated document positions.
textplot_scale1d(wordfish_texts)

# Document positions grouped by company. The grouping variable is taken from
# the dfm the model was fitted on (wordfishdfm) rather than from an unrelated
# dfm, so the groups always line up with the model's documents.
textplot_scale1d(wordfish_texts, groups = docvars(wordfishdfm, "company"))

# Feature scores, highlighting the stems "climat" and "risk".
textplot_scale1d(wordfish_texts, margin = "features",
                 highlighted = c("climat", "risk"))

# Document positions again, with the margin stated explicitly.
textplot_scale1d(wordfish_texts, margin = "documents")

# Document positions with confidence intervals.
predict(wordfish_texts, interval = "confidence")
## $fit
##                                                     fit        lwr        upr
## 054_AnnualReport_HuskyEnergy_2019.txt         0.3116844  0.3091058  0.3142631
## 055_AnnualReport_HuskyEnergy_2018.txt         0.3073945  0.3047200  0.3100690
## 056_AnnualReport_HuskyEnergy_2017.txt         0.2991271  0.2962149  0.3020392
## 057_AnnualReport_HuskyEnergy_2016.txt         0.3068291  0.3040431  0.3096150
## 058_AnnualReport_HuskyEnergy_2015.txt         0.3010559  0.2979155  0.3041963
## 071_AnnualReport_HuskyEnergy_2013.txt         0.2910691  0.2876392  0.2944990
## 072_AnnualReport_HuskyEnergy_2012.txt         0.2934328  0.2901948  0.2966707
## 073_AnnualReport_HuskyEnergy_2010.txt         0.3276491  0.3250178  0.3302804
## 073_AnnualReport_HuskyEnergy_2011.txt         0.2234487  0.2172648  0.2296325
## 075_AnnualReport_CenovusEnergy_2020.txt       0.2942164  0.2911704  0.2972624
## 076_AnnualReport_CenovusEnergy_2019.txt       0.2897672  0.2865971  0.2929372
## 077_AnnualReport_CenovusEnergy_2018.txt       0.2933042  0.2902445  0.2963640
## 078_AnnualReport_CenovusEnergy_2017.txt       0.2984973  0.2951541  0.3018406
## 079_AnnualReport_CenovusEnergy_2016.txt       0.2882101  0.2844296  0.2919906
## 080_AnnualReport_CenovusEnergy_2015.txt       0.3230175  0.3206210  0.3254140
## 081_AnnualReport_CenovusEnergy_2014.txt       0.3083649  0.3055509  0.3111789
## 082_AnnualReport_CenovusEnergy_2013.txt       0.3277134  0.3254751  0.3299517
## 083_AnnualReport_CenovusEnergy_2012.txt       0.3205909  0.3183852  0.3227965
## 084_AnnualReport_CenovusEnergy_2011.txt       0.3266069  0.3241771  0.3290367
## 194_AnnualReport_IndustrialAlliance_2010.txt -0.5458133 -0.5652217 -0.5264048
## 195_AnnualReport_IndustrialAlliance_2011.txt -0.5928042 -0.6111840 -0.5744245
## 196_AnnualReport_IndustrialAlliance_2012.txt -0.6022579 -0.6223498 -0.5821661
## 197_AnnualReport_IndustrialAlliance_2013.txt -0.5769753 -0.5964960 -0.5574546
## 198_AnnualReport_IndustrialAlliance_2014.txt -0.6001356 -0.6203793 -0.5798919
## 199_AnnualReport_IndustrialAlliance_2015.txt -0.6018909 -0.6224517 -0.5813301
## 200_AnnualReport_IndustrialAlliance_2016.txt -0.5984882 -0.6187772 -0.5781991
## 201_AnnualReport_IndustrialAlliance_2017.txt -0.5782648 -0.5995118 -0.5570178
## 202_AnnualReport_IndustrialAlliance_2018.txt -0.5485127 -0.5695449 -0.5274806
## 203_AnnualReport_IndustrialAlliance_2019.txt -5.5362530 -5.5416393 -5.5308668
## 247_AnnualReport_Shell_2020.txt               0.5067629  0.5064681  0.5070577
## 248_AnnualReport_Shell_2019.txt               0.5080365  0.5077383  0.5083347
## 249_AnnualReport_Shell_2018.txt               0.5115178  0.5112182  0.5118173
## 250_AnnualReport_Shell_2017.txt               0.5127528  0.5124431  0.5130626
## 251_AnnualReport_Shell_2016.txt               0.5158130  0.5154944  0.5161315
## 252_AnnualReport_Shell_2015.txt               0.4997502  0.4993714  0.5001289
## 253_AnnualReport_Shell_2014.txt               0.4984099  0.4980208  0.4987990
## 254_AnnualReport_Shell_2013.txt               0.4990824  0.4986864  0.4994783
## 255_AnnualReport_Shell_2012.txt               0.4983240  0.4979301  0.4987180
## 256_AnnualReport_Shell_2011.txt               0.4989671  0.4985725  0.4993617