library(tidytext)
library(dplyr)
library(readr)
library(stringr)
library(ggplot2)
library(tidyverse)
library(ggthemes)
library(magrittr)
require(scales)
library(data.table)
library(igraph)
library(ggraph)
setwd("~/Downloads/data_seminar/spru_phd")
spru_data <- read_csv("spru_phd.csv") %>%
select (- Firstname)
colnames(spru_data)
## [1] "Lastname" "Year" "Title" "Abstract" "Supervisors"
data(stop_words)
custom_stop_words <- c("thesis", "examine[[:alpha:]]*", "argue[:alpha:]*", "investig[[:alpha:]]*",
"analys[[:alpha:]]*", "contribut[[:alpha:]]*", "research[[:alpha:]]*", "empirical",
"framework", "supervisor", "academic", "chapter", "study")
bi_spru <- spru_data %>%
unnest_tokens(bigram, Abstract, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
mutate(Supervisors = strsplit(as.character(Supervisors), ";")) %>%
unnest(Supervisors) %>%
filter(!word1 %in% stop_words$word, !word1 %in% custom_stop_words,
!word2 %in% stop_words$word, !word2 %in% custom_stop_words)
bi_spru$bigram <- paste(bi_spru$word1, bi_spru$word2)
bi_spru <- select(bi_spru, -word1, -word2) %>%
count(Supervisors, bigram, sort = TRUE)
head(bi_spru)
## # A tibble: 6 x 3
## Supervisors bigram n
## <chr> <chr> <int>
## 1 "RMN Bell" technical change 19
## 2 " GN Von Tunzelmann" technological capabilities 18
## 3 "RMN Bell" developing countries 18
## 4 "RMN Bell" technological capability 18
## 5 "GN Von Tunzelmann" technical change 17
## 6 "KLR Pavitt" technical change 17
bi_sample <- bi_spru %>%
filter(n > 10)
cluster_matrix <- bi_sample %>%
group_by(bigram,Supervisors,n) %>%
spread(Supervisors,n)
cluster_matrix[is.na(cluster_matrix)] <- 0
dat <- cluster_matrix[,2:(ncol(cluster_matrix))] %>% as.data.frame
row.names(dat) <- cluster_matrix$bigram
row.order <- hclust(dist(dat))$order
col.order <- hclust(dist(t(dat)))$order
dat_new <- dat[row.order, col.order]
cluster_matrix <- melt(as.matrix(dat_new))
names(cluster_matrix) <- c("Bigram", "Supervisor","Freq")
ggplot(cluster_matrix,aes(x = Supervisor,
y = Bigram,
fill = Freq,
label = Freq)) +
geom_tile() + scale_fill_viridis_c() +
geom_text(color="#FFFFFF",size=2) +
theme_fivethirtyeight() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1)) +
theme(legend.position="none",
text = element_text(size=9)) +
labs(title = "Most Used Bigrams")
uni_spru_year <- spru_data %>%
unnest_tokens(word, Abstract) %>%
filter(!word %in% stop_words$word, !word %in% custom_stop_words) %>%
count(Year, word, sort = TRUE)
uni_sample_year <- uni_spru_year %>%
filter(n > 25)
uni_sample_year
## # A tibble: 46 x 3
## Year word n
## <dbl> <chr> <int>
## 1 1994 technology 51
## 2 1998 technological 50
## 3 2000 firms 48
## 4 2000 research 46
## 5 1994 firms 41
## 6 1996 technology 41
## 7 2000 capabilities 41
## 8 2010 policy 41
## 9 1993 policy 40
## 10 1996 firms 39
## # … with 36 more rows
cluster_matrix3 <- uni_sample_year %>%
group_by(word,Year,n) %>%
spread(Year,n)
cluster_matrix3[is.na(cluster_matrix3)] <- 0
dat <- cluster_matrix3[,2:(ncol(cluster_matrix3))] %>% as.data.frame
row.names(dat) <- cluster_matrix3$word
row.order <- hclust(dist(dat))$order
col.order <- hclust(dist(t(dat)))$order
dat_new <- dat[row.order, col.order]
cluster_matrix3 <- melt(as.matrix(dat_new))
names(cluster_matrix3) <- c("Word", "Year","Freq")
ggplot(cluster_matrix3, aes(x = as.character(Year),
y = Word,
fill = Freq,
label = Freq)) +
geom_tile() + scale_fill_viridis_c() +
geom_text(color="#FFFFFF",size=2) +
theme_fivethirtyeight() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1)) +
theme(legend.position="none",
text = element_text(size=9)) +
labs(title = "Most Used Words")
uni_spru_sup <- spru_data %>%
unnest_tokens(word, Abstract) %>%
mutate(Supervisors = strsplit(as.character(Supervisors), ";")) %>%
unnest(Supervisors) %>%
anti_join(stop_words, by = "word") %>%
filter(!word %in% custom_stop_words) %>%
count(Supervisors, word, sort = TRUE)
coword_sample_sup <- uni_spru_sup %>%
filter(n > 15)
coword_sample_sup
## # A tibble: 328 x 3
## Supervisors word n
## <chr> <chr> <int>
## 1 RMN Bell technological 130
## 2 KLR Pavitt technological 123
## 3 BR Martin research 122
## 4 RMN Bell technology 93
## 5 RMN Bell firms 90
## 6 KLR Pavitt firms 83
## 7 GN Von Tunzelmann firms 79
## 8 GN Von Tunzelmann technological 76
## 9 RMN Bell development 74
## 10 GN Von Tunzelmann growth 70
## # … with 318 more rows
node1 <- coword_sample_sup %>%
distinct(Supervisors) %>%
rename(node = Supervisors) %>%
mutate(type = T)
node2 <- coword_sample_sup %>%
distinct(word) %>%
rename(node = word) %>%
mutate(type = F)
nodes <- bind_rows(node1, node2)
graph_coword <- graph_from_data_frame(coword_sample_sup, vertices = nodes, directed = F)
graph_coword
## IGRAPH 66adbaa UN-B 175 328 --
## + attr: name (v/c), type (v/l), n (e/n)
## + edges from 66adbaa (vertex names):
## [1] RMN Bell --technological KLR Pavitt --technological
## [3] BR Martin --research RMN Bell --technology
## [5] RMN Bell --firms KLR Pavitt --firms
## [7] GN Von Tunzelmann--firms GN Von Tunzelmann--technological
## [9] RMN Bell --development GN Von Tunzelmann--growth
## [11] RMN Bell --industry RMN Bell --capabilities
## [13] GS MacKerron --energy GN Von Tunzelmann--innovation
## [15] KLR Pavitt --technology KLR Pavitt --innovation
## + ... omitted several edges
Visualising the network of words frequently used by SPRU PhD supervisors
coword_vis <- ggraph(graph_coword, layout = "fr") +
geom_edge_link0(edge_colour = "grey") +
geom_node_point(aes(shape = type, color = type), size = 2.5) +
theme_graph()
coword_vis