You can find a tutorial for the whole process here: http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know
The PDF of the PTM framework is here: https://www.bps.org.uk/news-and-policy/introducing-power-threat-meaning-framework
Convert PDF to Text
Conversion by pdftotext (you’ll need poppler-utils to do this)
# Input PDF and output text file. The PDF name contains spaces, so it carries
# its own single quotes to survive being pasted into a shell command line.
pdf.fname <- "'INF299 PTM Main web.pdf'"
txt.fname <- "PTMF.txt"
# Run pdftotext through the shell and stop here if the conversion fails,
# rather than carrying on to read a missing or stale text file.
sys.cmd <- paste("pdftotext", pdf.fname, txt.fname)
exit.status <- system(sys.cmd)
if (exit.status != 0) {
  stop("pdftotext failed with exit status ", exit.status, call. = FALSE)
}
Read the text file
# Load the converted text as a character vector, one element per line
txt <- readLines(con = txt.fname)
incomplete final line found on 'PTMF.txt'
# Build the corpus from the character vector read from the text file
docs <- Corpus(VectorSource(txt))
# Transformer that replaces every match of a regular expression with a space
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
# Strip extraneous formatting: the substitutions are applied one after
# another, in the order listed
docs <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpace, pattern),
  c("/", "@", "\\|", "●", "■■", "•", "‘", "’", "–", "\\t", "\\f"),
  docs
)
# Lower-case all text
docs <- tm_map(docs, content_transformer(tolower))
# Drop digits
docs <- tm_map(docs, removeNumbers)
# Drop common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Drop punctuation
docs <- tm_map(docs, removePunctuation)
# Collapse runs of whitespace
docs <- tm_map(docs, stripWhitespace)
Build Term-Document Matrix and Wordcloud
# Term-document matrix of the cleaned corpus, converted to a dense matrix
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
# Row totals give one overall frequency per term; sort most frequent first
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# The corpus is not used again below, so release its memory
rm(list = "docs")
# Seed the random number generator before drawing the word cloud
set.seed(1234)
wordcloud(
  words = d[["word"]], freq = d[["freq"]], scale = c(2, 0.5), min.freq = 1,
  max.words = 100, random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Set1")
)

Some More Quantitative Plots
Examine the frequency ‘drop off’ from highest to lowest occurring words (this will help filter the results)
# Frequency of each word against its rank; `d` is already sorted from the
# most to the least frequent word. The default x axis is suppressed so the
# two ends can be labelled by hand.
plot(d$freq, type = "l", xaxt = "n", xlab = "Rank", ylab = "Frequency")
# Ranks run from 1 to nrow(d), so the "Highest" tick belongs at 1, not 0
axis(1, at = c(1, nrow(d)), labels = c("Highest", "Lowest"))

Visually, it seems that at around a frequency of 40 there is a rapid fall in word occurrence, so we’ll strip out the words that occur 40 times or fewer. Look at the highest frequency terms:
# Show only the rows whose frequency exceeds 40
subset(d, freq > 40)
Now, removing “power”, “threat”, “meaning” and “framework” (as they are, after all, the title of the document and likely to be extensively used):
# Row positions of the title words, which are dropped from the table
rmv.idx <- which(d$word %in% c("power", "threat", "meaning", "framework"))
# setdiff() instead of d[-rmv.idx, ]: a negative empty index selects no rows
# at all, which would wipe the table if none of the title words were present
d2 <- d[setdiff(seq_len(nrow(d)), rmv.idx), ]
# Keep terms with freq > 40. The test must use d2's own frequencies: row
# positions computed from `d` no longer line up once rows have been removed,
# and would let terms at or below the threshold slip through.
d2 <- d2[d2$freq > 40, ]
And replot the wordcloud:
# Word cloud of the filtered table, drawn with the same settings as before
with(
  d2,
  wordcloud(
    words = word, freq = freq, scale = c(2, 0.5), min.freq = 1,
    max.words = 100, random.order = FALSE, rot.per = 0.35,
    colors = brewer.pal(8, "Set1")
  )
)

Word Associations
Revisiting again the most frequent words (here, the top 10):
# First ten rows of the filtered frequency table
head(d2, n = 10)
Note the high frequency of the modal verbs “may” and “can”. We can try and see what other terms correlate highly with these:
# Terms associated with the modal verbs, with a correlation limit of 0
assc <- findAssocs(dtm, c("may", "can"), c(0.0, 0.0))
# Two panels side by side
par(mfrow = c(1, 2))
# One bar chart per verb, showing the first 15 entries of its association
# vector
invisible(Map(
  function(correlations, title) {
    barplot(correlations[1:15],
      las = 2, col = "lightblue", main = title,
      ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
    )
  },
  assc, c("May", "Can")
))

“may” correlates most with “also”, “describe” and “compound”, whereas “can” with “seen” (as in “can be seen”) and “accommodated” and “counteracted” (as in “can be counteracted”).
It might be instructive to look at the top three words: “social”, “mental” and “health”
# Terms associated with each of the three words, correlation limit of 0
assc <- findAssocs(dtm, c("social", "mental", "health"), c(0.0, 0.0, 0.0))
# 2 x 2 grid of panels, of which three are used
par(mfrow = c(2, 2))
# One bar chart per word, showing the first 10 entries of its association
# vector
invisible(Map(
  function(correlations, title) {
    barplot(correlations[1:10],
      las = 2, col = "lightblue", main = title,
      ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
    )
  },
  assc, c("Social", "Mental", "Health")
))

Now, associations with some keywords. Plot correlations of “psychiatric” and “psychiatry”
# Terms associated with "psychiatric" and "psychiatry", correlation limit 0
assc <- findAssocs(dtm, c("psychiatric", "psychiatry"), c(0.0, 0.0))
# Two panels side by side
par(mfrow = c(1, 2))
# One bar chart per keyword, showing the first 10 entries of its association
# vector
invisible(Map(
  function(correlations, title) {
    barplot(correlations[1:10],
      las = 2, col = "lightblue", main = title,
      ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
    )
  },
  assc, c("Psychiatric", "Psychiatry")
))

Let’s examine for any contrasts with “psychology” and “psychological” (but here, we’ll have to filter “british”, “society”, “clinical”, “division”, “divisions” and “forum” to eliminate hits on professional body names; “january” because the footer contains both “psychological” and “january”; and “ptmmain” and “wwwbpsorguk” because they come from a weblink frequently cited to the forthcoming “main” framework document)
# Terms excluded from both association lists before plotting
ignore.terms <- c(
  "british", "society", "clinical",
  "division", "divisions", "forum", "january", "ptmmain", "wwwbpsorguk"
)
# A logical mask is used for the filtering instead of x[-which(...)]: when no
# name matches, -which() is an empty index and would discard every
# association instead of none.
assc.psychology <- findAssocs(dtm, c("psychology"), c(0.0))[[1]]
assc.psychology <-
  assc.psychology[!(names(assc.psychology) %in% ignore.terms)]
assc.psychological <- findAssocs(dtm, c("psychological"), c(0.0))[[1]]
assc.psychological <-
  assc.psychological[!(names(assc.psychological) %in% ignore.terms)]
# Two panels side by side
par(mfrow = c(1, 2))
# head() takes at most 10 entries, so a list left shorter than 10 by the
# filtering is not padded with NA bars the way [1:10] would pad it
barplot(head(assc.psychology, 10),
  las = 2, col = "lightblue", main = "Psychology",
  ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
)
barplot(head(assc.psychological, 10),
  las = 2, col = "lightblue", main = "Psychological",
  ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
)

Examine the correlating terms for “power”, “threat” and “meaning” (filtering out the other two title words, as well as “overview” and “framework”, so that the title of the document is not the highest correlation)
# For each title word, drop the other two title words plus "overview" and
# "framework" from its associations. A logical mask is used instead of
# x[-which(...)]: when no name matches, -which() is an empty index and would
# discard every association instead of none.
assc.power <- findAssocs(dtm, c("power"), c(0.0))[[1]]
# Remove "threat", "meaning", "overview" and "framework"
assc.power <- assc.power[
  !(names(assc.power) %in% c("threat", "meaning", "overview", "framework"))
]
assc.threat <- findAssocs(dtm, c("threat"), c(0.0))[[1]]
# Remove "power", "meaning", "overview" and "framework"
assc.threat <- assc.threat[
  !(names(assc.threat) %in% c("power", "meaning", "overview", "framework"))
]
assc.meaning <- findAssocs(dtm, c("meaning"), c(0.0))[[1]]
# Remove "power", "threat", "overview" and "framework"
assc.meaning <- assc.meaning[
  !(names(assc.meaning) %in% c("power", "threat", "overview", "framework"))
]
# 2 x 2 grid of panels, of which three are used
par(mfrow = c(2, 2))
# head() takes at most 10 entries, so a list left shorter than 10 by the
# filtering is not padded with NA bars the way [1:10] would pad it
barplot(head(assc.power, 10),
  las = 2, col = "lightblue", main = "Power",
  ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
)
barplot(head(assc.threat, 10),
  las = 2, col = "lightblue", main = "Threat",
  ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
)
barplot(head(assc.meaning, 10),
  las = 2, col = "lightblue", main = "Meaning",
  ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
)

More Controversial Keywords
Perhaps more loaded is to ask what correlates with “empirical” and “evidence” in the document:
# Terms associated with "empirical" and "evidence", correlation limit 0
assc <- findAssocs(dtm, c("empirical", "evidence"), c(0.0))
# Two panels side by side
par(mfrow = c(1, 2))
# One bar chart per keyword, showing the first 10 entries of its association
# vector
invisible(Map(
  function(correlations, title) {
    barplot(correlations[1:10],
      las = 2, col = "lightblue", main = title,
      ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
    )
  },
  assc, c("Empirical", "Evidence")
))

And for “medical” and “medicalisation”
# Terms associated with "medical" and "medicalisation", correlation limit 0
assc <- findAssocs(dtm, c("medical", "medicalisation"), c(0.0))
# Two panels side by side
par(mfrow = c(1, 2))
# One bar chart per keyword, showing the first 10 entries of its association
# vector
invisible(Map(
  function(correlations, title) {
    barplot(correlations[1:10],
      las = 2, col = "lightblue", main = title,
      ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
    )
  },
  assc, c("Medical", "Medicalisation")
))

And finally “formulation” and “narrative”:
# Terms associated with "formulation" and "narrative", correlation limit 0
assc <- findAssocs(dtm, c("formulation", "narrative"), c(0.0))
# Two panels side by side
par(mfrow = c(1, 2))
# One bar chart per keyword, showing the first 10 entries of its association
# vector
invisible(Map(
  function(correlations, title) {
    barplot(correlations[1:10],
      las = 2, col = "lightblue", main = title,
      ylab = "Correlations", cex = 0.8, cex.lab = 1.0, cex.axis = 1.0
    )
  },
  assc, c("formulation", "narrative")
))

