PHE produces numerous reports, bulletins, and communications, and receives large amounts of feedback. In recent years the ability to analyse text as data has developed rapidly, and there are now tools which help us gain insight quickly from large bodies of documents. This note applies some of these techniques to Duncan Selbie’s Friday Messages.
There are a number of steps:

- finding and downloading the bulletins from GOV.UK
- converting the pdfs to text and cleaning the results
- simple descriptive analysis: word counts and wordclouds
- building a document term matrix and examining term associations
- topic modelling to look for themes across the bulletins
This analysis is conducted using the statistical package R, which is rapidly becoming a standard tool for this kind of work.
knitr::opts_chunk$set(echo = FALSE)
First we need to load the libraries for the analysis.
library(knitr)
library(knitcitations)
suppressPackageStartupMessages(library(rvest))      # web scraping
suppressPackageStartupMessages(library(stringr))    # string handling
suppressPackageStartupMessages(library(dplyr))      # data manipulation
suppressPackageStartupMessages(library(tidyr))      # separate()/unite(), used below
suppressPackageStartupMessages(library(lubridate))  # dmy()/ymd(), used below
suppressPackageStartupMessages(library(httr))
suppressPackageStartupMessages(library(tm))         # document term matrices
suppressPackageStartupMessages(library(pdftools))   # pdf-to-text conversion
suppressPackageStartupMessages(library(tidytext))   # tidy text mining
suppressPackageStartupMessages(library(ggplot2))
source("~/themejf.R") ## A standard theme for plots
Then we need to get the data. This process identifies the Friday bulletins on the web, automatically downloads them, and stores the pdf files in a form in which they can be analysed further.
## Scraping bulletins
setwd("~/friday_messages")
url <- "https://www.gov.uk/government/collections/duncan-selbies-friday-messages"
page <- read_html(url)
urls <- page %>%
  html_nodes("a") %>% # find all links
  html_attr("href")   # get the urls
urls <- urls[grepl("friday-messages", urls)]
urls <- paste0("https://www.gov.uk", urls)
for (pg in urls){
  links <- read_html(pg) %>%
    html_nodes("a") %>%
    html_attr("href")
  ## keep only links to pdf attachments on gov.uk
  pdfs <- unique(links[grepl("^/government", links) & grepl("[Pp][Dd][Ff]$", links)])
  pdfs <- paste0("https://www.gov.uk", pdfs)
  ## a page can carry more than one attachment, so download one at a time;
  ## mode = "wb" keeps the pdfs intact on Windows
  for (pdf in pdfs){
    download.file(pdf, basename(pdf), mode = "wb", quiet = TRUE)
  }
}
The file names contain useful information - for example, the date of publication - which we can extract and use.
files <- list.files(pattern = "pdf$")
file <- data_frame(files) %>%
  mutate(doc = row_number())
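The chunk above builds a table of file names, but the step that reads the pdf text into the `corptd` data frame used below is missing from the note. A minimal sketch of that step, assuming `pdftools::pdf_text()` (loaded earlier) is the intended converter:

## Assumed step: read each downloaded pdf into one text string per bulletin.
## pdf_text() returns one string per page; collapse the pages with newlines,
## which the cleaning step below then replaces with spaces.
corptd <- file %>%
  mutate(text = vapply(files,
                       function(f) paste(pdf_text(f), collapse = "\n"),
                       character(1)))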
The bulletins have additional codes added by the process of converting pdfs to text, which we need to remove. We can also extract the date of publication from the file name.
corptd1 <- corptd %>%
  mutate(files = str_replace_all(files, "DS_[Ff]riday_message", "")) %>%
  mutate(files = str_replace_all(files, "Duncan_Selbie_s_[Ff]riday_message", "")) %>%
  mutate(files = str_replace(files, "Thursday", "")) %>%
  mutate(files = str_replace(files, "Wednesday", "")) %>%
  mutate(files = str_replace_all(files, "___", "_")) %>%
  mutate(files = str_replace_all(files, "\\.pdf$", "")) %>%
  mutate(files = str_replace_all(files, "[Ff][Ii][Nn][Aa][Ll]$", "")) %>%
  mutate(files = str_replace_all(files, "__", "_")) %>%
  mutate(files = str_replace_all(files, "_$", "")) %>%
  mutate(files = str_replace_all(files, "^_", "")) %>%
  mutate(files = str_replace_all(files, "_", "-")) %>%
  separate(files, c("day", "month", "year"), sep = "-") %>%
  mutate(month = str_replace_all(month, "January", "01")) %>%
  mutate(month = str_replace_all(month, "February", "02")) %>%
  mutate(month = str_replace_all(month, "March", "03")) %>%
  mutate(month = str_replace_all(month, "April", "04")) %>%
  mutate(month = str_replace_all(month, "May", "05")) %>%
  mutate(month = str_replace_all(month, "June", "06")) %>%
  mutate(month = str_replace_all(month, "July", "07")) %>%
  mutate(month = str_replace_all(month, "August", "08")) %>%
  mutate(month = str_replace_all(month, "September", "09")) %>%
  mutate(month = str_replace_all(month, "October", "10")) %>%
  mutate(month = str_replace_all(month, "November", "11")) %>%
  mutate(month = str_replace_all(month, "December", "12")) %>%
  mutate(len = nchar(day)) %>%
  mutate(day = ifelse(len == 1, paste0("0", day), day)) %>%
  mutate(year = ifelse(nchar(year) == 2, paste0("20", year), year)) %>%
  unite(date1, day, month, year, sep = "-") %>%
  mutate(date2 = dmy(date1))
## Replace line breaks with spaces so that words at the ends of lines are
## not fused together (e.g. "for" + "more" becoming "formore")
corptd1$text <- str_replace_all(corptd1$text, "\n", " ")
corptd1 <- corptd1 %>%
  select(date2, text)
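As an aside, the twelve month substitutions above could be collapsed using base R's built-in month.name vector - an equivalent alternative, sketched here rather than used:

## A compact alternative to the twelve month substitutions: look each month
## name up in month.name and zero-pad the index.
sprintf("%02d", match(c("January", "September", "December"), month.name))
## "01" "09" "12"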
Stop words are common English words which occur frequently in all documents and are generally removed for analytical purposes. In addition, there are words common to all bulletins which add little analytical value - we’ll add these to the stop_words lexicon.
bulletin_sw <- data_frame(word = c("phe", "friday", "week", "health", "published",
                                   "bulletin", "press", "public", "phe's",
                                   "www.gov.uk", "news", "publications",
                                   "gateway", "formore"),
                          lexicon = "SMART")
stop_words1 <- bind_rows(stop_words, bulletin_sw)
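A quick, purely illustrative check of how much the combined stop list removes:

## Roughly how many tokens do the stop lists remove? (illustrative check)
tokens <- corptd1 %>% unnest_tokens(word, text)
nrow(tokens)                                       # all tokens
nrow(anti_join(tokens, stop_words1, by = "word"))  # tokens kept after removal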
We can do some simple analysis, starting with the number of words in each bulletin.
corptd1 %>%
  group_by(date2) %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  summarise(words = sum(n)) %>%
  ggplot(aes(reorder(date2, words), words)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "",
       y = "Number of words",
       title = "Number of words per bulletin") +
  theme(axis.text.y = element_text(size = 10))
And wordclouds…
library(wordcloud)
cloud <- corptd1 %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words1, by = "word") %>%
  count(word, sort = TRUE)
with(cloud, wordcloud(word, n, max.words = Inf, scale = c(8, 0.2),
                      rot.per = 0.4, random.order = FALSE,
                      colors = brewer.pal(8, "Dark2")))
We can extend the analysis further by looking at the distribution of terms in individual bulletins, and then looking for patterns to see if bulletins can be clustered according to content.
First we can look at a single bulletin:
corptd1 %>%
  sample_n(1) %>%
  knitr::kable(format = "pandoc")
date2 | text |
---|---|
2014-09-05 | Friday 5 September 2014 Dear everyone I am delighted to share the news that we have appointed Professor Derrick Crook as Director of Microbiology at Public Health England, taking up the post on 1 January 2015. He has a fantastic track record in infectious diseases in the UK and internationally and is currently Professor of Microbiology at Oxford University and Oxford University Hospitals NHS Trust. Professor Crook will lead the translation of genomic technologies to transform microbiology practice across England in full partnership between PHE and Universities hosting NIHR Health Protection Units and including Cambridge University. Dr Christine McCartney has very kindly agreed to remain with PHE to work with Derrick as he takes up his new responsibilities. Flu is an unpredictable but recurring pressure that we face each winter. The last flu season was mild, but of course we cannot assume this year’s will be. PHE is working with the NHS to plan and implement the national flu immunisation programme so that the health system can be as prepared as possible. Our plans include a flu marketing campaign, launching on Monday 6 October, which aims to encourage those ‘at risk’ to have the flu vaccination ‘because they need it’. This approach is based on research undertaken for the campaign showing that the public’s attitudes to flu vaccination are largely rational. This campaign will run for four weeks and will target those under 65s who have long-term health conditions, including pregnant women who are at risk of complications. It will also target healthy children aged 2-4 years, who are being offered vaccine as part of a new strategy to improve flu control by reducing transmission in the community. The marketing campaign will run nationally on radio and in print media, with a TV campaign in the North East to help us evaluate the effectiveness of this medium. Our quarterly senior leadership forum met this week to consider the themes emerging from our internal strategic review of all our functions and services and how we deliver them. We will be sharing the final outcome as to what this means for the organisation in the coming weeks. While our first year was about keeping everything going with nothing falling over as we established sound relationships with our partners, our second year has allowed us to do some serious soul searching through the strategic review and the PHE Science Hub Programme about what a really successful PHE looks like, delivering at home and abroad. We are now ready to proceed with the implementation of the changes needed to lay the foundation of a decade of success for PHE responding to today’s public health challenges in fulfilling our remit letter from Government. As the renowned author Maya Angelou once observed, no one remembers what you said nor what you did, but everyone remembers how you made them feel, and whatever change we are about to go through as we form what we hope will be the right structure and approach for the next number of years, our test of success won’t be the brilliance of the structure or the elegance of the management arrangements, but how people experience the process and the outcome. There are relatively few things for which the Chief Executive has sole responsibility, this is one of them and it is my responsibility to decide how we must change and to make sure we do this in the best possible way. And finally, today we launched, in partnership with the LGA, the ADPH, NHS England, and the Department of Health, a comprehensive guide to the commissioning of sexual health, reproductive health, and HIV services. The aim of Making it work: a guide to whole system commissioning for sexual health, reproductive health and HIV is to help commissioners ensure that service users experience integrated, responsive services that deliver the best outcomes. With best wishes |
We can then create a per-document, per-term table known as a Document Term Matrix (DTM). First we count the terms per document.
corp_counts <- corptd1 %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words1, by = "word") %>%
  count(date2, word, sort = TRUE)
corp_counts
Next we can create the DTM.
corp_dtm <- corp_counts %>%
  cast_dtm(date2, word, n)
corp_dtm
<<DocumentTermMatrix (documents: 96, terms: 7693)>>
Non-/sparse entries: 21221/717307
Sparsity : 97%
Maximal term length: 29
Weighting : term frequency (tf)
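The matrix is 97% sparse - most terms appear in only a handful of bulletins. If that ever became a problem, tm can drop rarely used terms; a quick sketch, not used in the analysis below:

## Drop terms absent from more than 90% of bulletins (illustrative only;
## the analysis below uses the full matrix).
corp_dtm_trimmed <- removeSparseTerms(corp_dtm, 0.9)
corp_dtm_trimmed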
We can see which words tend to appear together in the bulletins.
## For example...
### Alcohol
findAssocs(corp_dtm, "alcohol", 0.75)
$alcohol
harm
0.75
### Sugar
findAssocs(corp_dtm, "sugar", 0.6)
$sugar
food drink
0.61 0.60
findAssocs(corp_dtm, "fingertips", 0.7) %>%
unlist()
fingertips.securing fingertips.amajor fingertips.andintegration
1.0 1.0 1.0
fingertips.areasand fingertips.aspossible fingertips.beingestablished
1.0 1.0 1.0
[output truncated: dozens more terms correlate at 1.0 or 0.7; most of them, like "amajor" and "andintegration", are words fused at line breaks by the pdf-to-text conversion rather than genuine vocabulary]
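The aim set out earlier was to see whether bulletins can be clustered by content. Topic modelling, below, is one route; a simpler illustrative route - a sketch, not part of the original analysis - is hierarchical clustering of the bulletins' term profiles:

## Normalise term counts by bulletin length, then cluster bulletins on the
## distance between their term-frequency profiles (illustrative sketch).
m <- as.matrix(corp_dtm)
m <- m / rowSums(m)
hc <- hclust(dist(m), method = "ward.D2")
plot(hc, cex = 0.5, main = "Bulletins clustered by term profile")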
And the next step is topic modelling - this allows us to analyse the whole body of bulletins and look for themes or topics: groupings of words within and between documents. Here we use Latent Dirichlet Allocation (LDA) with eight topics.
library(topicmodels)
## fit an 8-topic LDA model; fixing the seed makes the fit reproducible
corp_lda <- LDA(corp_dtm, k = 8, control = list(seed = 1234))
corp_lda
A LDA_VEM topic model with 8 topics.
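The choice of k = 8 topics is a judgement call. One rough way to compare candidate values of k is the perplexity reported by topicmodels (lower is better) - a sketch, not run for this note, and slow on a corpus of this size:

## Refit the model for several candidate topic counts and compare
## perplexity on the fitting data (illustrative sketch).
ks <- c(4, 8, 12, 16)
perp <- vapply(ks,
               function(k) perplexity(LDA(corp_dtm, k = k,
                                          control = list(seed = 1234))),
               numeric(1))
data_frame(k = ks, perplexity = perp)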
corp_lda_tidy <- tidy(corp_lda)
corp_lda_tidy %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  ggplot(aes(term, beta, fill = factor(topic), label = term)) +
  geom_bar(stat = "identity") +
  geom_text(hjust = 0) +
  coord_flip() +
  facet_wrap(~topic, ncol = 4) +
  theme_jf() +
  labs(fill = "Topic")
corp_lda_tidy %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_point(aes(colour = factor(topic))) +
  geom_line(aes(group = topic)) +
  coord_polar() +
  facet_wrap(~topic, ncol = 4) +
  theme(legend.position = "none") +
  theme(axis.text.x = element_text(size = 12))
We can also extract the per-document topic probabilities (gamma) and assign each bulletin to its most likely topic.
lda_gamma <- tidy(corp_lda, matrix = "gamma")
lda_gamma
abs_class <- lda_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup() %>%
  arrange(gamma)
abs_class
abs_class %>%
  group_by(topic) %>%
  count()
abs_class %>%
  ggplot(aes(ymd(document), as.factor(topic), label = document)) +
  geom_point(aes(colour = topic)) +
  geom_path(aes(colour = topic)) +
  geom_text(size = 2, angle = 45, hjust = 0, vjust = 0) +
  coord_cartesian(ylim = c(0, 9)) +
  labs(y = "Topic number") +
  theme(legend.position = "none")