library(tidyverse)
library(tidytext) 
library(topicmodels) 
library(tidyr) 
library(dplyr) 
library(ggplot2) 
library(kableExtra) 
library(knitr) 
library(ggrepel) 
library(gridExtra)
library(formattable) 
library(tm) 
library(circlize) 
library(plotly) 
library(wordcloud2)
library(lubridate)
library(stringr)
library(SnowballC)

#set parameters

#define some colors to use throughout
my_colors <- c("#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#D55E00", "#D65E00")

theme_plot <- function(aticks = element_blank(),
                         pgminor = element_blank(),
                         lt = element_blank(),
                         lp = "none")
{
  theme(plot.title = element_text(hjust = 0.5), #center the title
        axis.ticks = aticks, #set axis ticks to on or off
        panel.grid.minor = pgminor, #turn on or off the minor grid lines
        legend.title = lt, #turn on or off the legend title
        legend.position = lp) #turn on or off the legend
}

#customize the text tables for consistency using HTML formatting
my_kable_styling <- function(dat, caption) {
  kable(dat, "html", escape = FALSE, caption = caption) %>%
  kable_styling(bootstrap_options = c("striped", "condensed", "bordered"),
                full_width = FALSE)
}
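
The table helper is not exercised elsewhere in this section, so here is a minimal usage sketch; mtcars is just a stand-in for any small data frame.

#example usage: render a small data frame as a styled HTML table
my_kable_styling(head(mtcars[, 1:3]), "First Rows of mtcars")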

word_chart <- function(data, input, title) {
  data %>%
  #set y = 1 to just plot one variable and use the word as the label
  ggplot(aes(as.factor(row), 1, label = input, fill = factor(topic))) +
  #you want the words, not the points
  geom_point(color = "transparent") +
  #make sure the labels don't overlap
  geom_label_repel(nudge_x = .2,
                   direction = "y",
                   box.padding = 0.1,
                   segment.color = "transparent",
                   size = 3) +
  facet_grid(~topic) +
  theme_plot() +
  theme(axis.text.y = element_blank(), axis.text.x = element_blank(),
        panel.grid = element_blank(), panel.background = element_blank(),
        panel.border = element_rect("lightgray", fill = NA),
        strip.text.x = element_text(size = 9)) +
  labs(x = NULL, y = NULL, title = title) +
  coord_flip()
}
#read in literature review
review_data2 <- read_csv("data/review_updated_ai.csv")

Let’s inspect the articles by published year.

hist(review_data2$`Published Year`)

Tidy the data: convert variable names to lowercase and keep only abstract, published_year, journal, field, and subfield. Add a unique identifier (number) and unite field into a document column, so that each field serves as a document.

# convert all variable names to lower case
names(review_data2) <- tolower(names(review_data2))
 
#clean data and include a unique identifier
tidy_data2 <- review_data2 %>%
  rename(published_year = `published year`) %>%
  select(abstract, published_year, journal, field, subfield) %>% #keep only the needed columns
  mutate(number = row_number()) %>%
  unite(document, field)
  
# inspect
tidy_data2
## # A tibble: 137 x 6
##    abstract           published_year journal           document subfield  number
##    <chr>                       <dbl> <chr>             <chr>    <chr>      <int>
##  1 "With the rapid g~           2014 Knowledge-Based ~ ai       ai             1
##  2 "Plagiarism refer~           2017 Engineering Appl~ enginee~ eng_ai         2
##  3 "<bold>Introducti~           2018 International Jo~ technol~ tech_hea~      3
##  4 "Spoken dialog sy~           2015 Neurocomputing    technol~ tech_nc        4
##  5 "We designed a wo~           2015 Early Child Deve~ educati~ edu_early      5
##  6 "• Sequential dee~           2021 Safety Science    enginee~ eng_safe~      6
##  7 "The goal of this~           2019 International Jo~ educati~ edu_ai         7
##  8 "A Materials Acce~           2020 Advanced Science  science  sci_multi      8
##  9 "• Machine learni~           2020 Environment Inte~ science  sci_env_~      9
## 10 "Highlights: [•] ~           2014 Computers in Hum~ technol~ tech_hum~     10
## # ... with 127 more rows

tidy_data2 %>%
  group_by(journal) %>%
  summarize(n_abstracts = n_distinct(number)) %>%
  ggplot(aes(n_abstracts, journal)) +
  geom_col() +
  scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
  labs(x = "number of abstracts", y = NULL)

Unnest and tokenize the abstracts, removing stop words

#unnest to one token per row, then filter out non-word strings and stop words
token_words2 <- tidy_data2 %>%
  unnest_tokens(word, abstract) %>%
  filter(str_detect(word, "[a-z']$"), #keep tokens ending in a letter or apostrophe
         !word %in% stop_words$word)  #drop standard stop words
token_words2
## # A tibble: 20,135 x 6
##    published_year journal                 document subfield number word        
##             <dbl> <chr>                   <chr>    <chr>     <int> <chr>       
##  1           2014 Knowledge-Based Systems ai       ai            1 rapid       
##  2           2014 Knowledge-Based Systems ai       ai            1 growth      
##  3           2014 Knowledge-Based Systems ai       ai            1 data        
##  4           2014 Knowledge-Based Systems ai       ai            1 generated   
##  5           2014 Knowledge-Based Systems ai       ai            1 social      
##  6           2014 Knowledge-Based Systems ai       ai            1 web         
##  7           2014 Knowledge-Based Systems ai       ai            1 applications
##  8           2014 Knowledge-Based Systems ai       ai            1 paradigms   
##  9           2014 Knowledge-Based Systems ai       ai            1 generation  
## 10           2014 Knowledge-Based Systems ai       ai            1 knowledge   
## # ... with 20,125 more rows
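
As an aside, SnowballC is loaded in the setup but never used. If stemming were wanted (collapsing, e.g., "learning" and "learn" to a common root), a minimal sketch on top of the tokens would be:

#optional: reduce tokens to their Porter stems (not applied downstream)
stemmed_words2 <- token_words2 %>%
  mutate(word = wordStem(word))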

Create the document-term matrix and inspect it

I also tried this with journal as the document, which gave 137 documents over the same set of terms.

review_dtm2 <- token_words2 %>%
  count(document, word, sort = TRUE) %>%
  ungroup()

cast_dtm2 <- review_dtm2 %>%
  cast_dtm(document, word, n)

dim(cast_dtm2)
## [1]    5 4068
cast_dtm2
## <<DocumentTermMatrix (documents: 5, terms: 4068)>>
## Non-/sparse entries: 6688/13652
## Sparsity           : 67%
## Maximal term length: 24
## Weighting          : term frequency (tf)

I can see that “copyright” shows up in all five documents; it is publisher boilerplate rather than content, so I might need to go back and remove it, along with “author” and “abstract”.
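
One way to drop such boilerplate is an anti_join against a custom stop-word table. This is only a sketch (token_words2_filtered is a hypothetical name) and is not re-run here, so the outputs below still include those terms.

#hypothetical cleanup: remove publisher boilerplate before casting the DTM
custom_stop_words <- tibble(word = c("copyright", "author", "abstract"))

token_words2_filtered <- token_words2 %>%
  anti_join(custom_stop_words, by = "word")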

#look at all 5 documents and the first 8 terms of the DTM
inspect(cast_dtm2[1:5,1:8])
## <<DocumentTermMatrix (documents: 5, terms: 8)>>
## Non-/sparse entries: 40/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## Sample             :
##              Terms
## Docs          abstract author copyright design learning students system users
##   ai                32     11        30     12       15       20     18    24
##   education         99     64        48     89      217      134     66    37
##   engineering       37     12        37      2       20        5     23    38
##   science           27     11        24      4       34        1      7    21
##   technology        88     36        73     32       99       14     25    66

Save the source data under generic variable names for the models

#assign the source dataset to generic var names

source_dtm2 <- cast_dtm2
source_tidy2 <- token_words2

Fit Model 1: LDA

We will use Gibbs sampling rather than the default VEM method. Later, we will relate documents to topics based on the mean gamma for each topic/source.

k <- 5 #number of topics
seed <- 1234 #necessary for reproducibility
#fit the model
#you could pass more control parameters, but seed is enough here
lda <- LDA(source_dtm2, k = k, method = "Gibbs", control = list(seed = seed))
#examine the class of the LDA object
class(lda)
## [1] "LDA_Gibbs"
## attr(,"package")
## [1] "topicmodels"

Extract the per-topic-per-word probabilities, called β, from the model and show the top five terms in each topic.

review_topics <- tidy(lda, matrix = "beta")

top_terms <- review_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 5) %>% 
  ungroup() %>%
  arrange(topic, -beta)
top_terms
## # A tibble: 25 x 3
##    topic term           beta
##    <int> <chr>         <dbl>
##  1     1 students    0.0154 
##  2     1 users       0.0121 
##  3     1 tests       0.00870
##  4     1 engineering 0.00774
##  5     1 download    0.00630
##  6     2 bold        0.0113 
##  7     2 framework   0.0110 
##  8     2 neural      0.00769
##  9     2 game        0.00769
## 10     2 processing  0.00623
## # ... with 15 more rows
num_words <- 10 #number of words to visualize

#create a function that accepts the lda model and the number of words to display
top_terms_per_topic <- function(lda_model, num_words) {
  #tidy the LDA object to get word, topic, and probability (beta)
  topics_tidy <- tidy(lda_model, matrix = "beta")

  top_terms <- topics_tidy %>%
    group_by(topic) %>%
    arrange(topic, desc(beta)) %>%
    #get the top num_words PER topic
    slice(seq_len(num_words)) %>%
    arrange(topic, beta) %>%
    #row is required by the word_chart() function
    mutate(row = row_number()) %>%
    ungroup() %>%
    #add the word Topic to the topic labels
    mutate(topic = paste("Topic", topic, sep = " "))
  #create a title to pass to word_chart(); note k comes from the global environment
  title <- paste("LDA Top Terms for", k, "Topics")
  #call the word_chart function built in the prep work
  word_chart(top_terms, top_terms$term, title)
}
#call the function you just built!
top_terms_per_topic(lda, num_words)
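
word_chart() shows only the labels; to compare the beta values themselves, the conventional tidytext faceted bar chart works on the top_terms tibble created above. A sketch:

#alternative view: compare beta values across topics directly
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free_y") +
  scale_y_reordered() +
  labs(x = expression(beta), y = NULL)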

Show relationship between topic and journal field

tidy_data2
## # A tibble: 137 x 6
##    abstract           published_year journal           document subfield  number
##    <chr>                       <dbl> <chr>             <chr>    <chr>      <int>
##  1 "With the rapid g~           2014 Knowledge-Based ~ ai       ai             1
##  2 "Plagiarism refer~           2017 Engineering Appl~ enginee~ eng_ai         2
##  3 "<bold>Introducti~           2018 International Jo~ technol~ tech_hea~      3
##  4 "Spoken dialog sy~           2015 Neurocomputing    technol~ tech_nc        4
##  5 "We designed a wo~           2015 Early Child Deve~ educati~ edu_early      5
##  6 "• Sequential dee~           2021 Safety Science    enginee~ eng_safe~      6
##  7 "The goal of this~           2019 International Jo~ educati~ edu_ai         7
##  8 "A Materials Acce~           2020 Advanced Science  science  sci_multi      8
##  9 "• Machine learni~           2020 Environment Inte~ science  sci_env_~      9
## 10 "Highlights: [•] ~           2014 Computers in Hum~ technol~ tech_hum~     10
## # ... with 127 more rows
#using tidy with gamma gets the per-document topic probabilities,
#but you only have document, topic and gamma
source_topic_relationship <- tidy(lda, matrix = "gamma") %>%
  #join to the original tidy data by document to get the source field
  inner_join(tidy_data2, by = "document") %>%
  select(document, topic, gamma) %>%
  group_by(document, topic) %>%
  #get the avg doc gamma value per source/topic
  mutate(mean = mean(gamma)) %>%
  #remove the gamma value as you only need the mean
  select(-gamma) %>%
  #removing gamma created duplicates, so drop them
  distinct()
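
Since the mutate()/select()/distinct() sequence exists only to average gamma, the same table can be built more directly with summarize(); this sketch is equivalent:

#equivalent, more direct aggregation
source_topic_relationship <- tidy(lda, matrix = "gamma") %>%
  inner_join(tidy_data2, by = "document") %>%
  group_by(document, topic) %>%
  summarize(mean = mean(gamma), .groups = "drop")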

#relabel topics to include the word Topic
source_topic_relationship$topic = paste("Topic", source_topic_relationship$topic, sep = " ")

circos.clear() #very important! Reset the circular layout parameters
#assign colors to the outside bars around the circle
grid.col = c("education" = my_colors[1],
             "science" = my_colors[2],
             "ai" = my_colors[3],
             "technology" = my_colors[4],
             "engineering"= my_colors[5],
             "Topic 1" = "grey", "Topic 2" = "grey", "Topic 3" = "grey", "Topic 4" = "grey", "Topic 5" = "grey")

#set the global parameters for the circular layout, specifically the gap size (15)
#this also puts the topics on the top half and the sources on the bottom half
circos.par(gap.after = c(rep(5, length(unique(source_topic_relationship[[1]])) - 1), 15,
                         rep(5, length(unique(source_topic_relationship[[2]])) - 1), 15))
#main function that draws the diagram; transparency ranges from 0 to 1
chordDiagram(source_topic_relationship, grid.col = grid.col, transparency = .2)
title("Relationship Between Topic and Journal Field")

Fit Model 2: K-Means

The structure of the k-means object reveals two important pieces of information: the cluster assignments and the cluster centers.

#re-assign the source data to the generic names for model 2
source_dtm2 <- cast_dtm2
source_tidy2 <- token_words2
#Set a seed for replicable results
set.seed(1234)
k <- 4
kmeansResult <- kmeans(source_dtm2, k)
str(kmeansResult)
## List of 9
##  $ cluster     : Named int [1:5] 3 4 2 1 2
##   ..- attr(*, "names")= chr [1:5] "education" "technology" "engineering" "science" ...
##  $ centers     : num [1:4, 1:4068] 34 17.5 217 99 1 12.5 134 14 27 34.5 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "1" "2" "3" "4"
##   .. ..$ : chr [1:4068] "learning" "students" "abstract" "design" ...
##  $ totss       : num 132799
##  $ withinss    : num [1:4] 0 2923 0 0
##  $ tot.withinss: num 2923
##  $ betweenss   : num 129876
##  $ size        : int [1:4] 1 2 1 1
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
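
The choice of k = 4 is a judgment call, and with only five documents there is little room to vary it, but a quick elbow check on the total within-cluster sum of squares (not part of the original analysis) is one sanity check:

#elbow check: total within-cluster SS for k = 1 to 4 (only 5 documents here)
wss <- sapply(1:4, function(kk) {
  set.seed(1234)
  kmeans(source_dtm2, kk)$tot.withinss
})
plot(1:4, wss, type = "b", xlab = "number of clusters k",
     ylab = "total within-cluster sum of squares")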

The term “intelligence” appears in all four clusters but loads mainly on cluster three.

head(kmeansResult$centers[,"intelligence"])
##  1  2  3  4 
## 11  6 21 10

K-Means top terms

num_words <- 8 #number of words to display
#get the top words from the kmeans centers
kmeans_topics <- lapply(1:k, function(i) {
  s <- sort(kmeansResult$centers[i, ], decreasing = TRUE)
  names(s)[1:num_words]
})

#make sure it's a data frame
kmeans_topics_df <- as.data.frame(kmeans_topics)
#label the topics with the word Topic
names(kmeans_topics_df) <- paste("Topic", seq_len(k), sep = " ")
#create a sequential row id to use with gather()
kmeans_topics_df <- cbind(id = rownames(kmeans_topics_df),
                          kmeans_topics_df)
kmeans_topics_df
##   id   Topic 1   Topic 2  Topic 3   Topic 4
## 1  1  learning  abstract learning  learning
## 2  2  abstract copyright students  abstract
## 3  3 copyright     users abstract copyright
## 4  4     users    system   design     users
## 5  5     human  learning   system     based
## 6  6   service      user   author      user
## 7  7  articles  multiple    based    author
## 8  8 screening   systems training    design
#transpose it into the long format required for word_chart()
kmeans_top_terms <- kmeans_topics_df %>%
  gather(topic, term, -id)

kmeans_top_terms <- kmeans_top_terms %>%
  group_by(topic) %>%
  mutate(row = row_number()) %>% #needed by word_chart()
  ungroup()

title <- paste("K-Means Top Terms for", k, "Topics")
word_chart(kmeans_top_terms, kmeans_top_terms$term, title)