# topic_modeling

# libraries
require(topicmodels)

## Zorunlu paket yükleniyor: topicmodels

require(pdftools)

## Zorunlu paket yükleniyor: pdftools

## Using poppler version 21.04.0

require(tm)

## Zorunlu paket yükleniyor: tm

## Zorunlu paket yükleniyor: NLP

require(tidytext)

## Zorunlu paket yükleniyor: tidytext

require(ggplot2)

## Zorunlu paket yükleniyor: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

require(dplyr)

## Zorunlu paket yükleniyor: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load every PDF in the working directory, one text string per file.
# The pattern is anchored on a literal ".pdf" extension so that names which
# merely end in the letters "pdf" are not picked up.
all_files <- list.files(pattern = "\\.pdf$")
if (length(all_files) == 0) {
  stop("No PDF files found in ", getwd(), call. = FALSE)
}

# pdf_text() returns one string per page. Join the pages so that each file is
# a single plain string: handing a list of multi-page character vectors to
# VectorSource() makes the corpus coerce each element with as.character(),
# which deparses it and leaks literal "\n" escapes into the tokens
# (the source of fused terms such as "abilitynand").
# USE.NAMES = FALSE keeps the documents unnamed, so they are still identified
# by position ("1", "2", ...) downstream.
all_opinions <- vapply(
  all_files,
  function(path) paste(pdf_text(path), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)


# The corpus holds one document per PDF file
document <- Corpus(VectorSource(all_opinions)) # create corpus


# Clean the corpus text in a single pipeline, then build the
# document-term matrix from the cleaned corpus.
#
# Recorded run: each tm_map() step on this SimpleCorpus emitted the warning
# "transformation drops documents".
document <- document %>%
  tm_map(content_transformer(tolower)) %>%        # lower-case all text
  tm_map(removeNumbers) %>%                       # strip digits
  tm_map(removeWords, stopwords("english")) %>%   # drop English stopwords
  tm_map(removePunctuation,
         preserve_intra_word_dashes = TRUE) %>%   # keep dashes inside words
  tm_map(stripWhitespace)                         # collapse repeated whitespace

# One row per document, one column per term
dtm <- document %>%
  DocumentTermMatrix()


# Fit an LDA topic model with k = 4 topics; the seed is fixed at 1234
model_lda <- dtm %>%
  LDA(k = 4, control = list(seed = 1234))
model_lda

## A LDA_VEM topic model with 4 topics.

# Tidy the model's "beta" matrix: one row per (topic, term) pair, where beta
# is the probability of the term being associated with the topic
beta_topics <- model_lda %>%
  tidy(matrix = "beta")
beta_topics # print the tidy beta table

## # A tibble: 14,684 x 3
##    topic term             beta
##    <int> <chr>           <dbl>
##  1     1 aal         1.06e- 65
##  2     2 aal         6.65e-  4
##  3     3 aal         1.55e-102
##  4     4 aal         2.77e-103
##  5     1 ability     2.74e- 64
##  6     2 ability     9.97e-  4
##  7     3 ability     8.02e-  4
##  8     4 ability     6.96e-  4
##  9     1 abilitynand 4.62e- 66
## 10     2 abilitynand 3.32e-  4
## # ... with 14,674 more rows

# Top 10 terms by beta within each topic, sorted by topic and then by
# descending beta
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(order_by = beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))


# One bar-chart panel per topic; reorder_within() / scale_y_reordered() order
# the terms by beta separately inside each panel
beta_top_terms %>%
  mutate(term = reorder_within(term, by = beta, within = topic)) %>%
  ggplot(aes(x = beta, y = term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

# topic 1 is about students' mental limits, such as difficulty with maths
# topic 2 is about the tech classroom
# topic 3 is about tech education
# topic 4 is about education, craft, tech as a subject, and curriculum

# Term counts for document 3, most frequent terms first
# (the tidy DTM stores document ids as character strings)
dtm %>%
  tidy() %>%
  filter(document == "3") %>%
  arrange(desc(count))

## # A tibble: 1,717 x 3
##    document term       count
##    <chr>    <chr>      <dbl>
##  1 3        education    119
##  2 3        craft        102
##  3 3        technology    92
##  4 3        girls         57
##  5 3        boys          56
##  6 3        curriculum    52
##  7 3        school        43
##  8 3        students      39
##  9 3        estonia       36
## 10 3        subject       33
## # ... with 1,707 more rows

# Examine the per-document-per-topic probabilities (gamma).
# Gamma indicates how strongly each document is associated with each topic.
# In the recorded run document 4 had the highest gamma for topic 1.
# NOTE(review): the original notes quoted "6.4" (document 1) and "9.99"
# (document 4) for topic 1; these look like mantissas of scientific-notation
# output rather than raw gamma values - confirm against the printed table.
gamma_documents <- tidy(model_lda, matrix = "gamma")

# Copy the gamma results into a plain data frame
doc_gamma.df <- data.frame(gamma_documents)

# Numeric document index for the x axis, looked up from each row's own
# document id against the DTM's document order. This avoids hard-coding the
# number of topics and does not depend on the row order of the tidy output.
doc_gamma.df$chapter <- match(as.character(doc_gamma.df$document), Docs(dtm))


# Plot gamma against document index: one line and one stacked panel per topic
ggplot(
  data = doc_gamma.df,
  aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic))
) +
  geom_line() +
  facet_wrap(~ factor(topic), ncol = 1)

# interpreting the chart:
# document 4 is made up mostly of topic 1
# document 1 is made up mostly of topic 2
# document 2 is made up mostly of topic 3
# document 3 is made up mostly of topic 4