Introductory Data Science for Innovation

Group Exercise: Text Analysis of SPRU PhD Data

Dmitry Malkov, Ryan Lee, Tong Kowintasate

24 March 2020

Required libraries

library(tidytext)
library(dplyr)
library(readr)
library(stringr)
library(ggplot2)
library(tidyverse)
library(ggthemes)
library(magrittr)
require(scales)
library(data.table)
library(igraph)
library(ggraph)

Creating a heatmap for most commonly used bigrams among SPRU PhD supervisors

Loading the data set

setwd("~/Downloads/data_seminar/spru_phd") 
spru_data <- read_csv("spru_phd.csv") %>%
  select (- Firstname)
colnames(spru_data)
## [1] "Lastname"    "Year"        "Title"       "Abstract"    "Supervisors"

Loading general and custom stop words

data(stop_words)
custom_stop_words <- c("thesis", "examine[[:alpha:]]*", "argue[:alpha:]*", "investig[[:alpha:]]*", 
                       "analys[[:alpha:]]*", "contribut[[:alpha:]]*", "research[[:alpha:]]*", "empirical",
                       "framework", "supervisor", "academic", "chapter", "study")

Creating bigrams and tidying the dataset

bi_spru <- spru_data %>%
  unnest_tokens(bigram, Abstract, token = "ngrams", n = 2) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  mutate(Supervisors = strsplit(as.character(Supervisors), ";")) %>% 
  unnest(Supervisors) %>%
  filter(!word1 %in% stop_words$word, !word1 %in% custom_stop_words,
         !word2 %in% stop_words$word, !word2 %in% custom_stop_words)
bi_spru$bigram <- paste(bi_spru$word1, bi_spru$word2)
bi_spru <- select(bi_spru, -word1, -word2) %>%
  count(Supervisors, bigram, sort = TRUE) 
head(bi_spru)
## # A tibble: 6 x 3
##   Supervisors          bigram                         n
##   <chr>                <chr>                      <int>
## 1 "RMN Bell"           technical change              19
## 2 " GN Von Tunzelmann" technological capabilities    18
## 3 "RMN Bell"           developing countries          18
## 4 "RMN Bell"           technological capability      18
## 5 "GN Von Tunzelmann"  technical change              17
## 6 "KLR Pavitt"         technical change              17

Selecting a subsample

bi_sample <- bi_spru %>%
  filter(n > 10)

Creating cluster matrix for heatmap visualisation (Bigrams - Supervisors)

cluster_matrix <- bi_sample %>% 
  group_by(bigram,Supervisors,n) %>% 
  spread(Supervisors,n)
cluster_matrix[is.na(cluster_matrix)] <- 0
dat <- cluster_matrix[,2:(ncol(cluster_matrix))] %>% as.data.frame
row.names(dat) <- cluster_matrix$bigram
row.order <- hclust(dist(dat))$order
col.order <- hclust(dist(t(dat)))$order
dat_new <- dat[row.order, col.order]
cluster_matrix <- melt(as.matrix(dat_new))
names(cluster_matrix) <- c("Bigram", "Supervisor","Freq")

Plotting the heatmap (Bigrams - Supervisors)

ggplot(cluster_matrix,aes(x = Supervisor,
                          y = Bigram,
                          fill = Freq,
                          label = Freq)) + 
  geom_tile() + scale_fill_viridis_c() + 
  geom_text(color="#FFFFFF",size=2) +
  theme_fivethirtyeight() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1)) +
  theme(legend.position="none",
        text = element_text(size=9)) +
  labs(title = "Most Used Bigrams")

Creating an alternative heatmap for most used unigrams across all available years

Creating unigrams

uni_spru_year <- spru_data %>%
  unnest_tokens(word, Abstract) %>% 
  filter(!word %in% stop_words$word, !word %in% custom_stop_words) %>%
  count(Year, word, sort = TRUE) 

Creating a subsample

uni_sample_year <- uni_spru_year %>%
  filter(n > 25)

uni_sample_year
## # A tibble: 46 x 3
##     Year word              n
##    <dbl> <chr>         <int>
##  1  1994 technology       51
##  2  1998 technological    50
##  3  2000 firms            48
##  4  2000 research         46
##  5  1994 firms            41
##  6  1996 technology       41
##  7  2000 capabilities     41
##  8  2010 policy           41
##  9  1993 policy           40
## 10  1996 firms            39
## # … with 36 more rows

Creating cluster matrix for heatmap visualisation (Unigrams - Year)

cluster_matrix3 <- uni_sample_year %>% 
  group_by(word,Year,n) %>% 
  spread(Year,n)
cluster_matrix3[is.na(cluster_matrix3)] <- 0
dat <- cluster_matrix3[,2:(ncol(cluster_matrix3))] %>% as.data.frame
row.names(dat) <- cluster_matrix3$word
row.order <- hclust(dist(dat))$order
col.order <- hclust(dist(t(dat)))$order
dat_new <- dat[row.order, col.order]
cluster_matrix3 <- melt(as.matrix(dat_new))
names(cluster_matrix3) <- c("Word", "Year","Freq")

Plotting the heatmap (Unigrams - Year)

ggplot(cluster_matrix3, aes(x = as.character(Year),
                            y = Word,
                            fill = Freq,
                            label = Freq)) + 
  geom_tile() + scale_fill_viridis_c() + 
  geom_text(color="#FFFFFF",size=2) +
  theme_fivethirtyeight() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1)) +
  theme(legend.position="none",
        text = element_text(size=9)) +
  labs(title = "Most Used Words")

Co-word analysis

Modifying the data set

uni_spru_sup <- spru_data %>%
  unnest_tokens(word, Abstract) %>% 
  mutate(Supervisors = strsplit(as.character(Supervisors), ";")) %>% 
  unnest(Supervisors) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!word %in% custom_stop_words) %>%
  count(Supervisors, word, sort = TRUE) 

Selecting a subsample

coword_sample_sup <- uni_spru_sup %>%
  filter(n > 15)
coword_sample_sup
## # A tibble: 328 x 3
##    Supervisors       word              n
##    <chr>             <chr>         <int>
##  1 RMN Bell          technological   130
##  2 KLR Pavitt        technological   123
##  3 BR Martin         research        122
##  4 RMN Bell          technology       93
##  5 RMN Bell          firms            90
##  6 KLR Pavitt        firms            83
##  7 GN Von Tunzelmann firms            79
##  8 GN Von Tunzelmann technological    76
##  9 RMN Bell          development      74
## 10 GN Von Tunzelmann growth           70
## # … with 318 more rows

Assigning node types to Supervisors (True) and Words (False)

node1 <- coword_sample_sup %>%
  distinct(Supervisors) %>%
  rename(node = Supervisors) %>%
  mutate(type = T)

node2 <- coword_sample_sup %>%
  distinct(word) %>%
  rename(node = word) %>%
  mutate(type = F)

Binding Supervisor and Unigram nodes

nodes <- bind_rows(node1, node2)

graph_coword <- graph_from_data_frame(coword_sample_sup, vertices = nodes, directed = F)
graph_coword
## IGRAPH 66adbaa UN-B 175 328 -- 
## + attr: name (v/c), type (v/l), n (e/n)
## + edges from 66adbaa (vertex names):
##  [1] RMN Bell         --technological KLR Pavitt       --technological
##  [3] BR Martin        --research      RMN Bell         --technology   
##  [5] RMN Bell         --firms         KLR Pavitt       --firms        
##  [7] GN Von Tunzelmann--firms         GN Von Tunzelmann--technological
##  [9] RMN Bell         --development   GN Von Tunzelmann--growth       
## [11] RMN Bell         --industry      RMN Bell         --capabilities 
## [13] GS MacKerron     --energy        GN Von Tunzelmann--innovation   
## [15] KLR Pavitt       --technology    KLR Pavitt       --innovation   
## + ... omitted several edges

Visualising the network of words frequently used by SPRU PhD supervisors

coword_vis <- ggraph(graph_coword, layout = "fr") +
  geom_edge_link0(edge_colour = "grey") +
  geom_node_point(aes(shape = type, color =  type), size = 2.5) +
  theme_graph()
coword_vis