# Libraries ----
# library() stops with an error as soon as a package is missing, whereas
# require() only warns and returns FALSE, which would let the script run on
# and fail later with an unrelated "could not find function" error.
library(topicmodels) # LDA()
library(pdftools) # pdf_text()
## Using poppler version 21.04.0
library(tm) # Corpus(), tm_map(), DocumentTermMatrix()
## Loading required package: NLP
library(tidytext) # tidy(), reorder_within(), scale_y_reordered()
library(ggplot2) # plotting
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(dplyr) # %>%, group_by(), slice_max(), filter(), arrange(), mutate()
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load every PDF in the working directory, one corpus document per file.
# The pattern is anchored on the ".pdf" extension so that names which merely
# end in the letters "pdf" are not picked up.
all_files <- list.files(pattern = "\\.pdf$")
if (length(all_files) == 0) {
  # Fail here with a clear message instead of a confusing error from LDA()
  stop("No PDF files found in ", getwd(), call. = FALSE)
}
# pdf_text() returns one string per page. Collapse the pages so each file is
# exactly one string: passing a list of page vectors to VectorSource() lets the
# corpus coerce each element to a single deparsed string, which leaves literal
# "\n" sequences in the text and fuses words across line breaks (the term
# "abilitynand" in the recorded beta output is one such artefact).
# NOTE: output recorded further down in this file was produced before this
# fix, so term lists and counts will differ on the next run.
all_opinions <- vapply(
  all_files,
  function(path) paste(pdf_text(path), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE # keep the default document ids "1", "2", ...
)
# A corpus is the collection of documents the text-mining steps operate on
document <- Corpus(VectorSource(all_opinions)) # create corpus
# Text cleaning ----
# The corpus is normalised in one pipeline; every step receives the result of
# the previous one, in the same order as listed here.
# In the recorded run each tm_map() call on this SimpleCorpus emitted the
# warning "transformation drops documents".
document <- document %>%
  tm_map(content_transformer(tolower)) %>% # convert all text to lower case
  tm_map(removeNumbers) %>% # remove numbers
  tm_map(removeWords, stopwords("english")) %>% # remove English stopwords
  tm_map(removePunctuation, preserve_intra_word_dashes = TRUE) %>% # keep dashes inside words
  tm_map(stripWhitespace) # remove extra white space
# Document-term matrix built from the cleaned corpus
dtm <- DocumentTermMatrix(x = document)
# Fit an LDA model with 4 topics; the seed is fixed at 1234 so that repeated
# runs start from the same random state. "VEM" is the default method, spelled
# out here because the printed model class below is LDA_VEM.
model_lda <- LDA(
  x = dtm,
  k = 4,
  method = "VEM",
  control = list(seed = 1234)
)
model_lda
## A LDA_VEM topic model with 4 topics.
# Per-topic term probabilities ----
# beta is the probability of a term being associated with a topic; the tidy
# form has one row per topic/term pair.
beta_topics <- model_lda %>%
  tidy(matrix = "beta")
beta_topics # print the tibble
## # A tibble: 14,684 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 aal 1.06e- 65
## 2 2 aal 6.65e- 4
## 3 3 aal 1.55e-102
## 4 4 aal 2.77e-103
## 5 1 ability 2.74e- 64
## 6 2 ability 9.97e- 4
## 7 3 ability 8.02e- 4
## 8 4 ability 6.96e- 4
## 9 1 abilitynand 4.62e- 66
## 10 2 abilitynand 3.32e- 4
## # ... with 14,674 more rows
# The ten highest-beta terms of every topic, sorted by topic and then by
# descending beta (terms tied at the cut-off are all kept by slice_max()).
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(order_by = beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
# Bar chart of the top terms, one panel per topic.
# reorder_within() + scale_y_reordered() sort the bars by beta inside each
# panel rather than across all panels.
beta_top_terms %>%
  mutate(term = reorder_within(term, by = beta, within = topic)) %>%
  ggplot(mapping = aes(x = beta, y = term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(topic), scales = "free") +
  scale_y_reordered()

# topic 1 is about students' mental limits, e.g. difficulty with maths
# topic 2 is about the tech classroom
# topic 3 is about tech education
# topic 4 is about education, craft, tech subjects & curriculum
# Term counts of document 3, most frequent first.
# Document ids are stored as character (see the <chr> column below), hence
# the quoted "3".
dtm %>%
  tidy() %>%
  filter(document == "3") %>%
  arrange(-count)
## # A tibble: 1,717 x 3
## document term count
## <chr> <chr> <dbl>
## 1 3 education 119
## 2 3 craft 102
## 3 3 technology 92
## 4 3 girls 57
## 5 3 boys 56
## 6 3 curriculum 52
## 7 3 school 43
## 8 3 students 39
## 9 3 estonia 36
## 10 3 subject 33
## # ... with 1,707 more rows
# Per-document topic probabilities ----
# gamma indicates how strongly each document is associated with each topic;
# the tidy form has one row per document/topic pair.
# The original run noted document 1 at "6.4" and document 4 at "9.99" gamma on
# topic 1, with document 4 the highest (presumably mantissas of values in
# scientific notation -- verify against the printed tibble).
gamma_documents <- tidy(model_lda, matrix = "gamma")
# Plain data frame copy of the gamma results for plotting
doc_gamma.df <- data.frame(gamma_documents)
# Numeric x-axis position for each document, derived from the document column
# itself (index of first appearance). This replaces rep(1:n_docs, 4), which
# hard-coded the number of topics and relied on the row order of the tidy
# output, so it broke or mislabelled rows as soon as k changed.
doc_gamma.df$chapter <- match(
  doc_gamma.df$document,
  unique(doc_gamma.df$document)
)
# Plot gamma per document, one panel (and colour) per topic
ggplot(
  data = doc_gamma.df,
  aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic))
) +
  geom_line() +
  facet_wrap(~ factor(topic), ncol = 1)

# Interpreting the chart:
# document 4 is made up mostly of topic 1
# document 1 is made up mostly of topic 2
# document 2 is made up mostly of topic 3
# document 3 is made up mostly of topic 4