library(tidyverse)
library(tidytext)
library(topicmodels)
library(tidyr)
library(dplyr)
library(ggplot2)
library(kableExtra)
library(knitr)
library(ggrepel)
library(gridExtra)
library(formattable)
library(tm)
library(circlize)
library(plotly)
library(wordcloud2)
library(lubridate)
library(stringr)
library(SnowballC)
#set parameters
#define some colors to use throughout
my_colors <- c("#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#D55E00", "#D65E00")
theme_plot <- function(aticks = element_blank(),
pgminor = element_blank(),
lt = element_blank(),
lp = "none")
{
theme(plot.title = element_text(hjust = 0.5), #center the title
axis.ticks = aticks, #set axis ticks to on or off
panel.grid.minor = pgminor, #turn on or off the minor grid lines
legend.title = lt, #turn on or off the legend title
legend.position = lp) #turn on or off the legend
}
#customize the text tables for consistency using HTML formatting
my_kable_styling <- function(dat, caption) {
kable(dat, "html", escape = FALSE, caption = caption) %>%
kable_styling(bootstrap_options = c("striped", "condensed", "bordered"),
full_width = FALSE)
}
word_chart <- function(data, input, title) {
data %>%
#set y = 1 to just plot one variable and use word as the label
ggplot(aes(as.factor(row), 1, label = input, fill = factor(topic) )) +
#you want the words, not the points
geom_point(color = "transparent") +
#make sure the labels don't overlap
geom_label_repel(nudge_x = .2,
direction = "y",
box.padding = 0.1,
segment.color = "transparent",
size = 3) +
facet_grid(~topic) +
theme_plot() +
theme(axis.text.y = element_blank(), axis.text.x = element_blank(),
#axis.title.x = element_text(size = 9),
panel.grid = element_blank(), panel.background = element_blank(),
panel.border = element_rect("lightgray", fill = NA),
strip.text.x = element_text(size = 9)) +
labs(x = NULL, y = NULL, title = title) +
#xlab(NULL) + ylab(NULL) +
#ggtitle(title) +
coord_flip()
}
#read in literature review
review_data2 <-read_csv("data/review_updated_ai.csv")
Let’s inspect articles by published year
hist(review_data2$`Published Year`)
Tidy data by converting to lowercase, and only select abstract, published_year, journal, field and subfield. add a unique identifyer unite document as field.
# convert all variable names to lower case
names(review_data2) <- tolower(names(review_data2))
#Clean Data and include unique identifier
tidy_data2 <- review_data2 %>%
rename(published_year = `published year`)%>%
select(c('abstract', 'published_year', 'journal', 'field', 'subfield')) %>% # only select
mutate(number = row_number())%>%
unite(document, field)
# inspect
tidy_data2
## # A tibble: 137 x 6
## abstract published_year journal document subfield number
## <chr> <dbl> <chr> <chr> <chr> <int>
## 1 "With the rapid g~ 2014 Knowledge-Based ~ ai ai 1
## 2 "Plagiarism refer~ 2017 Engineering Appl~ enginee~ eng_ai 2
## 3 "<bold>Introducti~ 2018 International Jo~ technol~ tech_hea~ 3
## 4 "Spoken dialog sy~ 2015 Neurocomputing technol~ tech_nc 4
## 5 "We designed a wo~ 2015 Early Child Deve~ educati~ edu_early 5
## 6 "• Sequential dee~ 2021 Safety Science enginee~ eng_safe~ 6
## 7 "The goal of this~ 2019 International Jo~ educati~ edu_ai 7
## 8 "A Materials Acce~ 2020 Advanced Science science sci_multi 8
## 9 "• Machine learni~ 2020 Environment Inte~ science sci_env_~ 9
## 10 "Highlights: [•] ~ 2014 Computers in Hum~ technol~ tech_hum~ 10
## # ... with 127 more rows
library(ggplot2)
tidy_data2 %>%
group_by(journal) %>%
summarize(abstract = n_distinct(number)) %>%
ggplot(aes(abstract, journal)) +
geom_col() +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
labs(y = NULL)
#unnest
token_words2 <- tidy_data2 %>%
unnest_tokens(word, abstract) %>%
filter(str_detect(word, "[a-z']$"),
!word %in% stop_words$word)
token_words2
## # A tibble: 20,135 x 6
## published_year journal document subfield number word
## <dbl> <chr> <chr> <chr> <int> <chr>
## 1 2014 Knowledge-Based Systems ai ai 1 rapid
## 2 2014 Knowledge-Based Systems ai ai 1 growth
## 3 2014 Knowledge-Based Systems ai ai 1 data
## 4 2014 Knowledge-Based Systems ai ai 1 generated
## 5 2014 Knowledge-Based Systems ai ai 1 social
## 6 2014 Knowledge-Based Systems ai ai 1 web
## 7 2014 Knowledge-Based Systems ai ai 1 applications
## 8 2014 Knowledge-Based Systems ai ai 1 paradigms
## 9 2014 Knowledge-Based Systems ai ai 1 generation
## 10 2014 Knowledge-Based Systems ai ai 1 knowledge
## # ... with 20,125 more rows
I also did this using journal as the document and had 137 documents with the same terms.
review_dtm2 <- token_words2 %>%
count(document, word, sort = TRUE) %>%
ungroup()
cast_dtm2 <- review_dtm2 %>%
cast_dtm(document, word, n)
dim(cast_dtm2)
## [1] 5 4068
cast_dtm2
## <<DocumentTermMatrix (documents: 5, terms: 4068)>>
## Non-/sparse entries: 6688/13652
## Sparsity : 67%
## Maximal term length: 24
## Weighting : term frequency (tf)
I can see that “copyright” is showing up in all 5 topics. I might need to go back and remove that word. Also, “author”, “abstract”
#look at 4 documents and 8 words of the DTM
inspect(cast_dtm2[1:5,1:8])
## <<DocumentTermMatrix (documents: 5, terms: 8)>>
## Non-/sparse entries: 40/0
## Sparsity : 0%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs abstract author copyright design learning students system users
## ai 32 11 30 12 15 20 18 24
## education 99 64 48 89 217 134 66 37
## engineering 37 12 37 2 20 5 23 38
## science 27 11 24 4 34 1 7 21
## technology 88 36 73 32 99 14 25 66
#assign the source dataset to generic var names
source_dtm2 <- cast_dtm2
source_tidy2 <- token_words2
We will use the GIBBS sampling method with the default VEM. We will classify documents into Topics based on the mean of gamma for a topic/source.
k <- 5 #number of topics
seed = 1234 #necessary for reproducibility
#fit the model
#you could have more control parameters but will just use seed here
lda <- LDA(source_dtm2, k = k, method = "GIBBS", control = list(seed = seed))
#examine the class of the LDA object
class(lda)
## [1] "LDA_Gibbs"
## attr(,"package")
## [1] "topicmodels"
Extract the per-topic-per-word probabilities, called β from the model and show top 5 results in each topic.
review_topics <- tidy(lda, matrix = "beta")
top_terms <- review_topics %>%
group_by(topic) %>%
slice_max(beta, n = 5) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
## # A tibble: 25 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 students 0.0154
## 2 1 users 0.0121
## 3 1 tests 0.00870
## 4 1 engineering 0.00774
## 5 1 download 0.00630
## 6 2 bold 0.0113
## 7 2 framework 0.0110
## 8 2 neural 0.00769
## 9 2 game 0.00769
## 10 2 processing 0.00623
## # ... with 15 more rows
num_words <- 10 #number of words to visualize
#create function that accepts the lda model and num word to display
top_terms_per_topic <- function(lda_model, num_words) {
#tidy LDA object to get word, topic, and probability (beta)
topics_tidy <- tidy(lda_model, matrix = "beta")
top_terms <- topics_tidy %>%
group_by(topic) %>%
arrange(topic, desc(beta)) %>%
#get the top num_words PER topic
slice(seq_len(num_words)) %>%
arrange(topic, beta) %>%
#row is required for the word_chart() function
mutate(row = row_number()) %>%
ungroup() %>%
#add the word Topic to the topic labels
mutate(topic = paste("Topic", topic, sep = " "))
#create a title to pass to word_chart
title <- paste("LDA Top Terms for", k, "Topics")
#call the word_chart function you built in prep work
word_chart(top_terms, top_terms$term, title)
}
#call the function you just built!
top_terms_per_topic(lda, num_words)
Show relationship between topic and journal field
tidy_data2
## # A tibble: 137 x 6
## abstract published_year journal document subfield number
## <chr> <dbl> <chr> <chr> <chr> <int>
## 1 "With the rapid g~ 2014 Knowledge-Based ~ ai ai 1
## 2 "Plagiarism refer~ 2017 Engineering Appl~ enginee~ eng_ai 2
## 3 "<bold>Introducti~ 2018 International Jo~ technol~ tech_hea~ 3
## 4 "Spoken dialog sy~ 2015 Neurocomputing technol~ tech_nc 4
## 5 "We designed a wo~ 2015 Early Child Deve~ educati~ edu_early 5
## 6 "• Sequential dee~ 2021 Safety Science enginee~ eng_safe~ 6
## 7 "The goal of this~ 2019 International Jo~ educati~ edu_ai 7
## 8 "A Materials Acce~ 2020 Advanced Science science sci_multi 8
## 9 "• Machine learni~ 2020 Environment Inte~ science sci_env_~ 9
## 10 "Highlights: [•] ~ 2014 Computers in Hum~ technol~ tech_hum~ 10
## # ... with 127 more rows
#using tidy with gamma gets document probabilities into topic
#but you only have document, topic and gamma
source_topic_relationship <- tidy(lda, matrix = "gamma") %>%
#join to orig tidy data bydoc to get the source field
inner_join(tidy_data2, by = "document") %>%
select(document, topic, gamma) %>%
group_by(document, topic) %>%
#get the avg doc gamma value per source/topic
mutate(mean = mean(gamma)) %>%
#remove the gamma value as you only need the mean
select(-gamma) %>%
#removing gamma created duplicates so remove them
distinct()
#relabel topics to include the word Topic
source_topic_relationship$topic = paste("Topic", source_topic_relationship$topic, sep = " ")
circos.clear() #very important! Reset the circular layout parameters
#assign colors to the outside bars around the circle
grid.col = c("education" = my_colors[1],
"science" = my_colors[2],
"ai" = my_colors[3],
"technology" = my_colors[4],
"engineering"= my_colors[5],
"Topic 1" = "grey", "Topic 2" = "grey", "Topic 3" = "grey", "Topic 4" = "grey", "Topic 5" = "grey")
# set the global parameters for the circular layout. Specifically the gap size (15)
#this also determines that topic goes on top half and source on bottom half
circos.par(gap.after = c(rep(5, length(unique(source_topic_relationship[[1]])) - 1), 15,
rep(5, length(unique(source_topic_relationship[[2]])) - 1), 15))
#main function that draws the diagram. transparancy goes from 0-1
chordDiagram(source_topic_relationship, grid.col = grid.col, transparency = .2)
title("Relationship Between Topic and Journal Field")
The structure of the k-means object reveals two important pieces of information: clusters and centers
source_dtm2 <- cast_dtm2
source_tidy2 <- token_words2
#Set a seed for replicable results
set.seed(1234)
k <- 4
kmeansResult <- kmeans(source_dtm2, k)
str(kmeansResult)
## List of 9
## $ cluster : Named int [1:5] 3 4 2 1 2
## ..- attr(*, "names")= chr [1:5] "education" "technology" "engineering" "science" ...
## $ centers : num [1:4, 1:4068] 34 17.5 217 99 1 12.5 134 14 27 34.5 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:4068] "learning" "students" "abstract" "design" ...
## $ totss : num 132799
## $ withinss : num [1:4] 0 2923 0 0
## $ tot.withinss: num 2923
## $ betweenss : num 129876
## $ size : int [1:4] 1 2 1 1
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
Intelligence is in all four clusters, but falls mainly in cluster three.
head(kmeansResult$centers[,"intelligence"])
## 1 2 3 4
## 11 6 21 10
num_words <- 8 #number of words to display
#get the top words from the kmeans centers
kmeans_topics <- lapply(1:k, function(i) {
s <- sort(kmeansResult$centers[i, ], decreasing = T)
names(s)[1:num_words]
})
#make sure it's a data frame
kmeans_topics_df <- as.data.frame(kmeans_topics)
#label the topics with the word Topic
names(kmeans_topics_df) <- paste("Topic", seq(1:k), sep = " ")
#create a sequential row id to use with gather()
kmeans_topics_df <- cbind(id = rownames(kmeans_topics_df),
kmeans_topics_df)
kmeans_topics_df
## id Topic 1 Topic 2 Topic 3 Topic 4
## 1 1 learning abstract learning learning
## 2 2 abstract copyright students abstract
## 3 3 copyright users abstract copyright
## 4 4 users system design users
## 5 5 human learning system based
## 6 6 service user author user
## 7 7 articles multiple based author
## 8 8 screening systems training design
#transpose it into the format required for word_chart()
kmeans_top_terms <- kmeans_topics_df %>%
gather(id)
colnames(kmeans_top_terms) = c("topic", "term")
kmeans_top_terms <- kmeans_top_terms %>%
group_by(topic) %>%
mutate(row = row_number()) %>% #needed by word_chart()
ungroup()
title <- paste("K-Means Top Terms for", k, "Topics")
word_chart(kmeans_top_terms, kmeans_top_terms$term, title)