1 Setup

Filename: mig2830.Rmd

1.1 Libraries

Sys.time()
## [1] "2023-08-30 22:59:46 EDT"
#getwd()
# Update here for each folder
directory <- "wos_mig"
folder <- paste(getwd(), directory, sep = "/") 
library(readxl)
library(writexl)
library(DT)
library(tidyr)
library(dplyr) # mutate
library(ggplot2)
library(stringr) # str_extract(x, pattern)
# Pre-selected columns from Web of Science saved records 
wos_cols <- scan(file = "wos_cols.txt", what = character(), sep = '\t')
wos_cols
##  [1] "DOI"                   "Author Full Names"     "Article Title"        
##  [4] "Source Title"          "Author Keywords"       "Keywords Plus"        
##  [7] "Abstract"              "Times Cited, WoS Core" "Publication Year"     
## [10] "WoS Categories"

1.2 User-Defined Functions

1.2.1 Simple Frequency Table with Percentage

User-defined functions freq_tables and print_freq_tables build simple descending frequency tables with percentages.

# Generate a descending frequency table from a variable
freq_tables <- function(var) {
  freq_table <- data.frame(table(var))
  freq_table <- freq_table[rev(order(freq_table$Freq)),]
  freq_table$cumsum <- cumsum(freq_table$Freq)
  freq_table$prop <- freq_table$Freq / length(var)
  freq_table$cum <- cumsum(freq_table$prop)
  freq_table$pct <- paste(round(freq_table$prop*100, 2), "%", sep = "")
  freq_table$cumpct <- paste(round(freq_table$cum*100, 2), "%", sep = "")
  return(freq_table)
}
# Print out a descending frequency table from a variable
print_freq_tables <- function(var) {
  freq_table <- data.frame(table(var))
  freq_table <- freq_table[rev(order(freq_table$Freq)),]
  freq_table$cumsum <- cumsum(freq_table$Freq)
  freq_table$prop <- freq_table$Freq / length(var)
  freq_table$cum <- cumsum(freq_table$prop)
  freq_table$pct <- paste(round(freq_table$prop*100, 2), "%", sep = "")
  freq_table$cumpct <- paste(round(freq_table$cum*100, 2), "%", sep = "")
  rownames(freq_table) <- seq_len(nrow(freq_table))
  return(datatable(freq_table[,c("var","Freq","cumsum","pct","cumpct")]))
}
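
# A quick self-check on toy data (not from the WoS set): the result should be
# ordered b, c, a with Freq 3, 1, 1 and pct "60%", "20%", "20%".
toy_freq <- freq_tables(c("a", "b", "b", "c", "b"))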

1.2.2 Frequency Table for Cross-listings (customized total)

Frequency table of cross-listed categories with a controlled total that is not the total count.

## Frequency table of cross-listed categories with a controlled total that is not the raw count.
## This function does NOT return a cumulative sum: when the denominator differs from the actual
## total count, a cumulative sum does not make much sense.
## The denominator defaults to the total count of observations, which is appropriate when there
## is no cross-listing.
freq_crosslist <- function(var, denominator = length(var)) {
  if (length(var) > 1) {
    fq_crlt <- data.frame(table(var))
    fq_crlt <- fq_crlt[rev(order(fq_crlt$Freq)), ]
    fq_crlt$prop <- fq_crlt$Freq / denominator
    names(fq_crlt) <- c("Term", "Frequency", "Share")
    fq_crlt$Term <-
      as.character(fq_crlt$Term) # save the Term column as characters from factors, to avoid auto-recoding
    fq_crlt$Share <- as.numeric(fq_crlt$Share)
    rownames(fq_crlt) <- NULL
    return(fq_crlt)
  } else if (length(var) == 1) {
    fq_crlt <- data.frame(matrix(ncol = 3, nrow = 1))
    colnames(fq_crlt) <- c("Term", "Frequency", "Share")
    fq_crlt$Term <- var
    fq_crlt$Frequency <- 1
    fq_crlt$Share <- 1 / denominator # respect a user-supplied denominator
    return(fq_crlt)
  } else {
    fq_crlt <- data.frame(matrix(ncol = 3, nrow = 1))
    colnames(fq_crlt) <- c("Term", "Frequency", "Share")
    fq_crlt$Term <- "Place_Holder"
    fq_crlt$Frequency <- 0
    fq_crlt$Share <- 0
    print("The expected multi-element vector has no elements; a placeholder row with `Frequency = 0` is introduced.")
    return(fq_crlt)
  }
}
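
# Usage sketch on toy data: four category labels drawn from three publications,
# so the shares should be computed out of 3 rather than out of 4.
toy_crosslist <- freq_crosslist(c("Economics", "Economics", "Demography", "Geography"),
                                denominator = 3)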


############# The following display-only function lacks the robustness checks above.
############# They are not needed here because a loop that displays multiple tables does not work anyway.

## Compared to freq_crosslist, the following function adds a Percent column rounded to 2 decimal places.
print_fq_crlt <- function(var, denominator = length(var),
                          header = paste("Frequency Table", "(including cross-listed items)")) {
  ## same steps as in freq_crosslist
  fq_crlt <- data.frame(table(var))
  fq_crlt <- fq_crlt[rev(order(fq_crlt$Freq)),]
  fq_crlt$prop <- fq_crlt$Freq / denominator
  names(fq_crlt) <- c("Term","Frequency","Share")
  fq_crlt$Term <- as.character(fq_crlt$Term) # keep Term as character, not factor, to avoid auto-recoding
  fq_crlt$Share <- as.numeric(fq_crlt$Share)
  rownames(fq_crlt) <- NULL
  fq_crlt$Percent <- paste(round(100*fq_crlt$Share, 2), "%", sep = "")
  return(datatable(fq_crlt[, !colnames(fq_crlt) %in% c("Share")], caption = header))
}

## Optional next step: write a loop that iteratively processes multiple frequency tables, indexing from a list of variables

1.2.3 Word Tokenize & N-grams

I also wrote a simple tokenizing function, str_tokenize, inspired by Python's nltk. Another very handy function is word_count. I then added the bi_gram() and tri_gram() functions to simplify my work.

str_tokenize <- function(text, split){
  if (is.character(text)) {
    my_token <- unlist(strsplit(text, split = split))
    names(my_token) <- seq_along(my_token)
    return(my_token)
  } else stop("The input text must be a character vector.")
} # added on May 6, 2023

word_count <- function(text, split = " ") {
  if (is.character(text)) {
    sapply(strsplit(text, split = split), length)
  } else stop("The input text must be a character vector.")
}

## The following n-gram functions return a vector of n-grams from a text body.
## The input must already be tokenized.
bi_gram <- function(token) {
  if (length(token) >= 3) {
    bi_gram <- c()
    for (i in 1:(length(token) - 1)) {
      bi_gram[i] <- paste(token[i], token[i + 1])
    }
    return(bi_gram)
  } else
    stop("The input must be TOKENIZED and have a length greater than 2.")
}

tri_gram <- function(token) {
  if (length(token) >= 4) {
    tri_gram <- c()
    for (i in 1:(length(token) - 2)) {
      tri_gram[i] <- paste(token[i], token[i + 1], token[i + 2])
    }
    return(tri_gram)
  } else stop("The input must be TOKENIZED and have a length greater than 3.")
}
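
# A quick sketch of these helpers on a toy sentence (not from the data):
# word_count() should return 6; the first bi-gram is "skilled migration" and the
# first tri-gram is "skilled migration shapes".
toy_tokens <- str_tokenize("skilled migration shapes the labor market", split = " ")
toy_bigrams <- bi_gram(toy_tokens)
toy_trigrams <- tri_gram(toy_tokens)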

2 Data

2.1 Search Terms

TS = (skilled migrat* OR h1b OR h1-b OR h-1b OR skilled immigra*)

2.2 Read-in Raw Data

List all files in the directory

list.files(path = folder)
## [1] "savedrecs1_1000.xlsx"    "savedrecs1001_2000.xlsx"
## [3] "savedrecs2001_2830.xlsx"
filenames <- list.files(path = folder)
# Only read in Excel files
filenames <- filenames[grepl(pattern = "\\.xlsx$", filenames)]

2.3 Prepare Data

Read in every eligible file in the directory. Make sure the data structure is the same, then stack them row-wise into a single data frame.

nlist <- vector("list", length(filenames))
# Read-in all eligible files from directory
# Make sure to double wrap a list element with [[]]
for (n in seq_along(filenames)) {
  nlist[[n]] <- read_excel(path = paste(folder, filenames[n], sep = "/"), sheet = "savedrecs")
}
# Concatenate multiple dataframes into one single dataframe. 
# Do this outside the loop to keep the original column names
dat_raw <- do.call(what = "rbind", lapply(nlist, as.data.frame))
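# An equivalent one-liner with dplyr (already loaded), as a sketch, assuming
# identical column types across the files:
# dat_raw <- data.frame(bind_rows(nlist))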
nrow(dat_raw)
## [1] 2830
# Keep Article, Proceedings Paper
table(dat_raw$`Document Type`)
## 
##                    Article      Article; Book Chapter 
##                       2542                          3 
##      Article; Early Access Article; Proceedings Paper 
##                         84                         72 
##                Book Review                 Correction 
##                         31                          2 
##   Correction; Early Access         Editorial Material 
##                          1                         41 
##                     Letter           Meeting Abstract 
##                          4                          2 
##                  News Item                       Note 
##                          2                          1 
##                     Review       Review; Early Access 
##                         44                          1
dat_temp <- dat_raw[grepl(pattern = ("Article|Proceedings Paper"), dat_raw$`Document Type`),]
nrow(dat_temp)
## [1] 2701
table(dat_temp$`Document Type`)
## 
##                    Article      Article; Book Chapter 
##                       2542                          3 
##      Article; Early Access Article; Proceedings Paper 
##                         84                         72
# Remove rows with empty publication years
dat_temp <- dat_temp[!is.na(dat_temp$`Publication Year`),]
nrow(dat_temp)
## [1] 2617
table(dat_temp$`Publication Year`)
## 
## 1957 1973 1977 1985 1986 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 
##    1    1    3    1    2    4    1    3    4    8    6    9    8   14   15   22 
## 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 
##   13   34   24   19   29   37   37   40   57   66   76  109  116  104  130  139 
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 
##  148  147  171  167  194  203  209  195   51
# Select columns based on pre-determined variables. Make sure the format of the column names is consistent.
dat <- dat_temp[,names(dat_temp) %in% wos_cols]
# validate that all wanted columns are included, expecting all TRUEs
names(dat) %in% wos_cols
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
dat <- dat[order(dat$`Publication Year`),]

2.3.1 Missing Values

colSums(is.na(dat))
##     Author Full Names         Article Title          Source Title 
##                     0                     0                     0 
##       Author Keywords         Keywords Plus              Abstract 
##                   780                   379                    37 
## Times Cited, WoS Core      Publication Year                   DOI 
##                     0                     0                   116 
##        WoS Categories 
##                     1

Many entries don’t have DOI information, so another column is needed as a case identifier.

# Which is the one that doesn't have WoS Category?
dat$`Source Title`[is.na(dat$`WoS Categories`)]
## [1] "PHILIPPINE POLITICAL SCIENCE JOURNAL"
# This can be imputed, since the source journal is self-explanatory
dat$`WoS Categories`[dat$`Source Title` == "PHILIPPINE POLITICAL SCIENCE JOURNAL"] <- "Political Science"
# Check missing values again
colSums(is.na(dat))
##     Author Full Names         Article Title          Source Title 
##                     0                     0                     0 
##       Author Keywords         Keywords Plus              Abstract 
##                   780                   379                    37 
## Times Cited, WoS Core      Publication Year                   DOI 
##                     0                     0                   116 
##        WoS Categories 
##                     0

Are the missing keywords from specific journals?

print_freq_tables(dat$`Source Title`[is.na(dat$`Author Keywords`) & is.na(dat$`Keywords Plus`)])
# International Migration has many missing keywords

Are the missing-keyword entries from specific years?

print_freq_tables(dat$`Publication Year`[is.na(dat$`Author Keywords`) & is.na(dat$`Keywords Plus`)])

2.3.2 Case ID

Because not all entries have DOI, a hand-made case ID column is needed.

# last two-digit of publication year
yr <- substr(dat$`Publication Year`, start = 3, stop = 4)
# numbering rows within groups
dat <- dat %>% group_by(`Publication Year`) %>% mutate(count = row_number())
# concatenate an id column from the year suffix and the within-year count
id <- paste(yr, dat$count, sep = "_")
# add id columns
dat$id <- id
dat$id2 <- 1:nrow(dat)

## ids for 1991-2020
id_1991_2020 <- dat$id[dat$`Publication Year`>=1991 & dat$`Publication Year` <=2020]
## ids for 1991-2022
id_1991_2022 <- dat$id[dat$`Publication Year`>=1991 & dat$`Publication Year` <=2022]

# drop count and DOI columns
dat <- dat[, !names(dat) %in% c("DOI","count")]
# Optionally activate the following to drop 2023 entries
#dat <- dat[dat$`Publication Year` <2023, ]

Create ids for each decade.

id80 <- dat$id[dat$`Publication Year` <= 1990]
id90 <- dat$id[dat$`Publication Year` >= 1991 & dat$`Publication Year` <= 2000]
id00 <- dat$id[dat$`Publication Year` >= 2001 & dat$`Publication Year` <= 2010]
id10 <- dat$id[dat$`Publication Year` >= 2011 & dat$`Publication Year` <= 2020]
id20 <- dat$id[dat$`Publication Year` >= 2021 & dat$`Publication Year` <= 2030]

2.3.3 Keywords

Concatenate keywords

# assign missing keyword columns to 99
dat$`Author Keywords`[is.na(dat$`Author Keywords`)] <- '99'
dat$`Keywords Plus`[is.na(dat$`Keywords Plus`)] <- '99'
# concatenate two keyword columns into one column
keywords <- paste(dat$`Author Keywords`, dat$`Keywords Plus`, sep = ";")
# add the concatenated keyword column to the main data
dat$keywords <- keywords
# replace the place-holder '99's with ''; match the separator variants before the bare '99'
dat$keywords <- gsub(pattern = '99;99|99;|;99|99', replacement = '', dat$keywords)
# change case to upper
dat$keywords <- toupper(dat$keywords)
# note: the blank cells are not NAs; handle them later
sum(is.na(dat$keywords))
## [1] 0
# drop the original keyword columns
dat <- dat[, !names(dat) %in% c("Author Keywords","Keywords Plus")]
# Count and share of rows with empty keywords
nrow(dat[nchar(dat$keywords)<1,])
## [1] 165
nrow(dat[nchar(dat$keywords)>=1,])/nrow(dat)
## [1] 0.9369507
nrow(dat[nchar(dat$keywords)<1,])/nrow(dat)
## [1] 0.06304929

2.4 Save Data

# Prepare a readme file that provides information about this data
readme <- data.frame(cbind(
  source = "Web of Science",
  search = "TS = (skilled migrat* OR h1b OR h1-b OR h-1b OR skilled immigra*)",
  date = as.character(as.Date(Sys.Date())),
  notes = "re-download after meeting with Dr. Morcol on 04/24/2023 to include all entries regardless of their discipline"
))
# Adaptive file name
data_name = paste(directory, nrow(dat_raw), ".xlsx", sep = "")
write_xlsx(list("readme" = readme, "data" = dat, "raw" = dat_raw), data_name)

2.5 Re-load Data

dat <- read_excel(path = data_name, sheet = "data")
dat <- dat[order(dat$`Publication Year`),]
# when the data are reloaded, the blank entries become missing (NA)
colSums(is.na(dat))

3 Text Preprocessing

library(textstem) # lemmatize_strings
library(stopwords)
library(tm) # removePunctuation

3.1 Before (raw text)

## Step 1: fill NA abstract with ""
dat$Abstract[is.na(dat$Abstract)] <- ''
paste(dat$`Article Title`[100], dat$Abstract[100], sep = ' ')
## [1] "New migrations in the Asia-Pacific region: a force for social and political change A rapid increase in international migration is a central aspect of the social transformations currently taking place in the Asia-Pacific region. Population movements take many forms, including permanent migration, temporary labour migration, mobility of highly skilled personnel, refugee movements and family reunion. Destinations include North America, the Gulf oil states and - increasingly - the fast-growing 'tiger economies' of Asia. Much of the migration is undocumented and a growing proportion of the migrants are women. So far, researchers and policy-makers have concentrated on short-term economic and regulatory aspects. But migration is likely to be a major factor bringing about social and political change in the region. The social networks which develop as part of the migratory process often make official migration control policies difficult to implement. Unplanned settlement is taking place, with important consequences for both sending and receiving societies. Scholars from a number of countries in the region have therefore established an Asia Pacific Migration Research Network to study these issues, to raise public awareness and to provide advice to policymakers. The article describes the aims and development of this Network, which is part of the UNESCO Management of Social Transformations Programme."
## Step 2: Concatenate article title and abstract
maintext_raw = tolower(paste(dat$`Article Title`, dat$Abstract, sep = ' '))
## Step 3: replace hyphen with space
maintext <- gsub('-',' ', maintext_raw)
## Step 4: Lemmatize
lemm <- lemmatize_strings(maintext)
## Step 5: remove punctuation
no_punct <- removePunctuation(lemm)
## Step 6: Remove stopwords and tokenize (manual tokenize using unlist and strsplit)
final_text <- c()
for (i in seq_along(no_punct)) {
  tokens <- unlist(strsplit(no_punct[i], split = ' '))
  final_text[i] <- paste(tokens[!tokens %in% stopwords::stopwords()], collapse = ' ')
}
final_text <- gsub("\\s+", " ", final_text) # remove excessive white space
dat$text <- final_text
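# A close tm-based equivalent of Step 6, as a sketch (removeWords drops the same
# stopword list; stripWhitespace collapses the leftover runs of spaces):
# final_text <- trimws(stripWhitespace(removeWords(no_punct, stopwords::stopwords())))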

3.2 After (processed text)

Show the same text string after preprocessing

dat$text[100]
## [1] "new migration asia pacific region force social political change rapid increase international migration central aspect social transformation currently take place asia pacific region population movement take many form include permanent migration temporary labour migration mobility highly skill personnel refugee movement family reunion destination include north america gulf oil state increasingly fast grow tiger economy asia much migration undocumented grow proportion migrant woman far researcher policy maker concentrate short term economic regulatory aspect migration likely major factor bring social political change region social network develop part migratory process often make official migration control policy difficult implement unplanned settlement take place important consequence send receive society scholar numb country region therefore establish asia pacific migration research network study issue raise public awareness provide advice policymaker article describe aim development network part unesco management social transformation programme"

4 Analysis

Start from discipline and longitudinal patterns, then split all the entries by decade.

4.1 Discipline

Continue working with a smaller number of columns

names(dat)
##  [1] "Author Full Names"     "Article Title"         "Source Title"         
##  [4] "Abstract"              "Times Cited, WoS Core" "Publication Year"     
##  [7] "WoS Categories"        "id"                    "id2"                  
## [10] "keywords"              "text"
year_cat <- separate_rows(dat[,c("Publication Year", "WoS Categories")], "WoS Categories", sep = ";", convert = TRUE)
dim(year_cat)
## [1] 4201    2
# Trim white space
year_cat$`WoS Categories` <- trimws(year_cat$`WoS Categories`)

What is the distribution by discipline?

cat_freq <- data.frame(table(year_cat$`WoS Categories`))
cat_freq$prop <- cat_freq$Freq/length(dat$id)
cat_freq <- cat_freq[rev(order(cat_freq$Freq)),]
sum(cat_freq$prop) # > 1 because one publication can belong to multiple disciplines
## [1] 1.605273
cat_freq$pct <- paste(round(cat_freq$prop,4)*100,"%",sep = "")
rownames(cat_freq) <- 1:nrow(cat_freq)
datatable(cat_freq)

How many publications have more than one discipline?

# Publications that belong to more than 1 discipline
length(grep(pattern = ";", dat$`WoS Categories`))
## [1] 1203
# In % expression
paste(100*length(grep(pattern = ";", dat$`WoS Categories`))/nrow(dat),"%")
## [1] "45.968666411922 %"
# Frequency table of all presented discipline combinations
print_freq_tables(dat$`WoS Categories`)

What is the share of PLSC/PADM in this universe?

# Index the row numbers of PLSC and PADM publications
plsc_padm_i <- grep(pattern = "Political Science|Public Administration", dat$`WoS Categories`)
# How many publications belong to either PLSC or PADM
length(plsc_padm_i)
## [1] 146
# the share of PLSC and PADM in %
paste(100*length(plsc_padm_i)/nrow(dat),"%")
## [1] "5.57890714558655 %"

Which journals publish the most in PLSC/PADM?

print_freq_tables(dat$`Source Title`[plsc_padm_i])
top_plsc_padm_j <- freq_tables(dat$`Source Title`[plsc_padm_i])$var[1]
# Which discipline does the top PLSC/PADM journal belong to?
unique(dat$`WoS Categories`[dat$`Source Title` == top_plsc_padm_j])
## [1] "Economics; Public Administration"

4.1.1 Journal distribution

Econ

econ_i <- grep(pattern = "Economics", dat$`WoS Categories`)
length(econ_i)
## [1] 880
print_freq_tables(dat$`Source Title`[econ_i])

Demo

demo_i <- grep(pattern = "Demography", dat$`WoS Categories`)
length(demo_i)
## [1] 510
print_freq_tables(dat$`Source Title`[demo_i])

Which are the most “productive” journals?

print_freq_tables(dat$`Source Title`)

4.2 Year

table(dat$`Publication Year`)
## 
## 1957 1973 1977 1985 1986 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 
##    1    1    3    1    2    4    1    3    4    8    6    9    8   14   15   22 
## 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 
##   13   34   24   19   29   37   37   40   57   66   76  109  116  104  130  139 
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 
##  148  147  171  167  194  203  209  195   51
year_ct <- dat[dat$`Publication Year`>1990 & dat$`Publication Year`<2023,] %>% count(`Publication Year`)
ggplot(data = year_ct, aes(x = `Publication Year`, y = n)) +
  geom_line() +
  labs(title = paste(min(year_ct$`Publication Year`),"to",max(year_ct$`Publication Year`)), y = "Publication Count")

How many were published in or before 1990? When did the first publication come out?

# Use the hand-made id as the unique publication identifier; count the items in or before 1990
length(dat$id[dat$`Publication Year` <= 1990])
## [1] 16
# The earliest publication
dat[dat$`Publication Year` == min(dat$`Publication Year`),c("Article Title", "Publication Year", "Source Title","WoS Categories")]
## # A tibble: 1 × 4
## # Groups:   Publication Year [1]
##   `Article Title`             `Publication Year` `Source Title` `WoS Categories`
##   <chr>                                    <dbl> <chr>          <chr>           
## 1 SOME PSYCHO-SOCIAL CHARACT…               1957 HUMAN RELATIO… Management; Soc…

4.3 Discipline by Year

Qualitatively examine the earliest publications, then systematically investigate publications by decade.

4.3.1 Earliest Publications

Which disciplines were the earliest to study this? (<=1990)

# Which are the disciplines that published on this topic in or before 1990?
cat_early <- trimws(unlist(strsplit(dat$`WoS Categories`[dat$`Publication Year` <=1990], split = ";")))
print_freq_tables(cat_early)

The earliest publication

dat[dat$`Publication Year` == min(dat$`Publication Year`),c("Publication Year","Source Title", "WoS Categories")]
## # A tibble: 1 × 3
## # Groups:   Publication Year [1]
##   `Publication Year` `Source Title`  `WoS Categories`                           
##                <dbl> <chr>           <chr>                                      
## 1               1957 HUMAN RELATIONS Management; Social Sciences, Interdiscipli…
dat$`Article Title`[dat$`Publication Year` == min(dat$`Publication Year`)] # Not available
## [1] "SOME PSYCHO-SOCIAL CHARACTERISTICS OF SATISFIED AND DISSATISFIED BRITISH IMMIGRANT SKILLED MANUAL WORKERS IN WESTERN AUSTRALIA"

List the earliest publications; their blank abstracts are to be filled by hand.

datatable(dat[dat$`Publication Year` <= 1990,c("Publication Year","Source Title", "Article Title")])

Earliest in PLSC/PADM

datatable(dat[plsc_padm_i,c("Publication Year","Source Title", "Article Title")])

4.3.2 Publication Count by Decade

Add a decade column

# Initiate a new column
dat$decade <- NA
# Assign decade-specific values
dat$decade[dat$`Publication Year` <= 1990] <- "t80"
dat$decade[dat$`Publication Year` >= 1991 & dat$`Publication Year` <= 2000] <- "t90"
dat$decade[dat$`Publication Year` >= 2001 & dat$`Publication Year` <= 2010] <- "t00"
dat$decade[dat$`Publication Year` >= 2011 & dat$`Publication Year` <= 2020] <- "t10"
dat$decade[dat$`Publication Year` >= 2021 & dat$`Publication Year` <= 2030] <- "t20"
# Make it an ordered factor
dat$decade <- factor(dat$decade, ordered = T, levels = c("t80", "t90", "t00", "t10", "t20"))
levels(dat$decade)
## [1] "t80" "t90" "t00" "t10" "t20"
print_freq_tables(dat$decade)
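# An equivalent assignment with cut(), as a sketch; the breaks reproduce the bins above:
# dat$decade <- cut(dat$`Publication Year`,
#                   breaks = c(-Inf, 1990, 2000, 2010, 2020, 2030),
#                   labels = c("t80", "t90", "t00", "t10", "t20"),
#                   ordered_result = TRUE)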

4.3.3 Share of PLSC/PADM

Get the ids for the PLSC/PADM publications

id_plsc <- dat$id[grep(pattern = "Political Science", dat$`WoS Categories`)]
id_padm <- dat$id[grep(pattern = "Public Administration", dat$`WoS Categories`)]
## ids for PLSC & PADM publications
id_plsc_padm <- union(id_padm, id_plsc)

Count of PLSC/PADM publications in the 90s through the 20s

dat_9010 <- dat[dat$decade %in% c("t90","t00","t10"),]
# Count of PLSC/PADM publications by decade
c90 <- nrow(dat_9010[dat_9010$id %in% c(id_plsc, id_padm) & dat_9010$decade == "t90",]);c90
## [1] 9
c00 <- nrow(dat_9010[dat_9010$id %in% c(id_plsc, id_padm) & dat_9010$decade == "t00",]);c00
## [1] 27
c10 <- nrow(dat_9010[dat_9010$id %in% c(id_plsc, id_padm) & dat_9010$decade == "t10",]);c10
## [1] 82
c20 <- nrow(dat[dat$id %in% c(id_plsc, id_padm) & dat$decade == "t20",]);c20
## [1] 28

Share of PLSC/PADM in each decade's total

100*c90/length(id90)
## [1] 6.766917
100*c00/length(id00)
## [1] 5.465587
100*c10/length(id10)
## [1] 5.398288
100*c20/length(id20)
## [1] 6.153846

4.3.4 New Disciplines in 00s and 10s

Only the three decades

nrow(dat_9010)
## [1] 2146
# Wrap the decade variable in as.character to keep the non-applicable t80 and t20 levels from appearing in the table
print_freq_tables(as.character(dat_9010$decade))

How many disciplines appear between 1991 and 2020? First, single out the vector of disciplines; then check their presence in each decade.

cat_list <- as.character(cat_freq$Var1)
## Get a decade-by-category (dec_cat) dataframe
dec_cat <- separate_rows(data = dat[,c("decade", "WoS Categories")], "WoS Categories", sep = ";", convert = T)
dec_cat$`WoS Categories` <- trimws(dec_cat$`WoS Categories`)
## Which are the disciplines in the 80s and before?
print_freq_tables(dec_cat$`WoS Categories`[dec_cat$decade == "t80"])
## Which are the disciplines in the 90s?
print_freq_tables(dec_cat$`WoS Categories`[dec_cat$decade == "t90"])
## Which are the disciplines in the 00s?
print_freq_tables(dec_cat$`WoS Categories`[dec_cat$decade == "t00"])
## Which are the disciplines in the 10s?
print_freq_tables(dec_cat$`WoS Categories`[dec_cat$decade == "t10"])
## Which are the disciplines in the 20s?
print_freq_tables(dec_cat$`WoS Categories`[dec_cat$decade == "t20"])

4.3.5 More concentrated or diffused?

Is this thread of literature becoming more diffused or more concentrated?

top3dis_id <- unique(dat$id[grepl(pattern = "Economics|Demography|Geography", dat$`WoS Categories`, ignore.case = T)])
length(top3dis_id)
## [1] 1546
## 1990s, share of top 3 disciplines: Econ, Demo, Geo
length(top3dis_id[top3dis_id %in% dat$id[dat$decade == 't90']])
## [1] 91
length(top3dis_id[top3dis_id %in% dat$id[dat$decade == 't90']]) / nrow(dat[dat$decade == "t90",])
## [1] 0.6842105
## 2000s, share of top 3 disciplines: Econ, Demo, Geo
length(top3dis_id[top3dis_id %in% dat$id[dat$decade == 't00']])
## [1] 320
length(top3dis_id[top3dis_id %in% dat$id[dat$decade == 't00']]) / nrow(dat[dat$decade == "t00",])
## [1] 0.6477733
## 2010s
length(top3dis_id[top3dis_id %in% dat$id[dat$decade == 't10']])
## [1] 874
length(top3dis_id[top3dis_id %in% dat$id[dat$decade == 't10']]) / nrow(dat[dat$decade == "t10",])
## [1] 0.5753785
## 2020s
length(top3dis_id[top3dis_id %in% dat$id[dat$decade == 't20']])
## [1] 246
length(top3dis_id[top3dis_id %in% dat$id[dat$decade == 't20']]) / nrow(dat[dat$decade == "t20",])
## [1] 0.5406593

Which are the “new” disciplines in this literature?

## New disciplines that emerged in the 00s, compared to the 90s
new_cat_00 <- setdiff(unique(dec_cat$`WoS Categories`[dec_cat$decade == "t00"]), unique(dec_cat$`WoS Categories`[dec_cat$decade == "t90"]));head(new_cat_00)
## [1] "Social Sciences, Mathematical Methods"
## [2] "Engineering, Industrial"              
## [3] "Agricultural Economics & Policy"      
## [4] "Family Studies"                       
## [5] "Health Policy & Services"             
## [6] "Psychology, Clinical"
## New disciplines that emerged in the 10s, compared to the 00s
new_cat_10 <- setdiff(unique(dec_cat$`WoS Categories`[dec_cat$decade == "t10"]), unique(dec_cat$`WoS Categories`[dec_cat$decade == "t00"]));length(new_cat_10)
## [1] 43
new_cat_10 <- setdiff(new_cat_10, unique(dec_cat$`WoS Categories`[dec_cat$decade == "t90"]))
length(new_cat_10);head(new_cat_10)
## [1] 39
## [1] "Behavioral Sciences"      "Psychology, Experimental"
## [3] "Evolutionary Biology"     "Linguistics"             
## [5] "Psychology, Educational"  "Ecology"

Which are the topics in the newer disciplines?

## First, get an id-discipline dataframe, make sure to include decade info for later on
cat_id <- separate_rows(data = dat[, c("WoS Categories", "id", "decade")], "WoS Categories", sep = ";", convert = T)
cat_id$`WoS Categories` <- trimws(cat_id$`WoS Categories`)
head(cat_id)
## # A tibble: 6 × 3
##   `WoS Categories`                   id    decade
##   <chr>                              <chr> <ord> 
## 1 Management                         57_1  t80   
## 2 Social Sciences, Interdisciplinary 57_1  t80   
## 3 Economics                          73_1  t80   
## 4 Demography                         77_1  t80   
## 5 Demography                         77_2  t80   
## 6 Demography                         77_3  t80
## Now, we have the discipline vectors for the 2000s and 2010s, and the discipline-id table. 
## Get the article ids with emerging disciplines in the literature
### The lengths of the id vectors for the 2000s and the 2010s give the numbers of publications from new disciplines
id00_new_cat <- unique(cat_id$id[cat_id$`WoS Categories` %in% new_cat_00]); length(id00_new_cat)
## [1] 250
id10_new_cat <- unique(cat_id$id[cat_id$`WoS Categories` %in% new_cat_10]); length(id10_new_cat)
## [1] 131

5 Topic

The skilled migration publications have expanded over the past three decades. The large number and relatively long history of this literature enable a thorough investigation into its evolution and variation across disciplines. What was the mainstream topic in this literature? Did the mainstream topic evolve over time? How did different disciplines approach this topic? What specific policies and programs have been covered by political science and public administration scholars?

5.1 Mainstream

Get keyword frequency

keyword <- separate_rows(data = dat[nchar(dat$keywords)>1,c("keywords","decade", "id")], "keywords", sep = ";", convert = T)
keyword$keywords <- trimws(keyword$keywords)
keyword <- keyword[nchar(keyword$keywords)>1,]
head(keyword)
## # A tibble: 6 × 3
##   keywords                     decade id   
##   <chr>                        <ord>  <chr>
## 1 ENGLISH-LANGUAGE PROFICIENCY t90    91_2 
## 2 LABOR-MARKET                 t90    91_2 
## 3 HISPANIC MEN                 t90    91_2 
## 4 UNEMPLOYMENT                 t90    91_3 
## 5 MIGRATION                    t90    91_3 
## 6 POLICY                       t90    91_3

Merge keywords with cat_id by the id column

key_cat <- merge(keyword, cat_id, by = "id", all.x = T)
key_cat <- key_cat[nchar(key_cat$keywords)>1,] # eliminate ' ' place holder, keep >=2 characters
key_cat$keywords <- toupper(key_cat$keywords)

All-time keyword frequency

Remove stopwords. Here the stopwords are the search terms of this query; their presence does not add information.

stopwords <- c("migration","immigration","immigrant","immigrants","migrants","migrant","skilled migration", "skilled migrants","emigration","skills","skill")
stopwords <- toupper(stopwords)
## Remove stopwords from the keyword-discipline data
keyword <- keyword[!keyword$keywords %in% stopwords,]
key_cat <- key_cat[!key_cat$keywords %in% stopwords,]

5.1.1 Keyword Frequency

keyword_freq <- freq_tables(keyword$keywords) # stopwords were already removed above
print_freq_tables(keyword$keywords)
top20_keyword_freq <- paste(keyword_freq$var[1:20]," (", keyword_freq$Freq[1:20], ", ", keyword_freq$pct[1:20],")", sep = "")
# collapse to concatenate a vector of strings
paste(tolower(top20_keyword_freq), collapse = ", ")
## [1] "impact (239, 0.69%), mobility (238, 0.69%), international migration (209, 0.61%), gender (188, 0.54%), employment (175, 0.51%), earnings (174, 0.5%), labor (165, 0.48%), education (159, 0.46%), growth (154, 0.45%), brain-drain (136, 0.39%), labor-market (125, 0.36%), unemployment (117, 0.34%), networks (116, 0.34%), brain drain (105, 0.3%), united-states (99, 0.29%), policy (99, 0.29%), women (97, 0.28%), wages (96, 0.28%), trade (96, 0.28%), workers (95, 0.28%)"

5.1.2 Themes from Keywords (patterns)

The keywords were combined using three strategies. The first strategy is to combine all phrases that include the target keyword; for example, impact was the most frequent word, so phrases such as “economic impact” and “demographic impact” were included in the impact group. The second strategy is to combine spelling variations, plural forms, and synonyms. The third strategy is to combine keywords that have different meanings but belong to the same category; for example, I grouped sex (gender, women, men), family (generation, family, fertility, children), age (aging, youth, young, older), and race (race, ethnic) related keywords into a broader demographic characteristics category. Also combined here were the policy-related words, including policies, regulation, law, rule, and politics.

## Strategy one, sub-string matching
#pat_mobility = "mobility"
#pat_impact = "impact"
#pat_network = "network"

## Strategy two, spelling variations, plural forms, synonyms
pat_income = "wage|earning|income|salary"
pat_employ = "employ|labor|labour|job"
#pat_brain = "brain drain|brain-drain"
#pat_growth = "growth"

## Strategy three, words belong to the same category/family
pat_education = "education|learn|student|university"
pat_policy = "policy|policies|regulation|rule| law"
pat_politics = "politics|political"
#pat_pol = paste(pat_policy, pat_politics, sep = "|"); pat_pol
#pat_demo = "aging|youth|young|older|gender|woman|women|family|families|fertility|^man$|^men$|children|generation|race|identity|ethnic" # de-activate for now
#pat_attitude = "attitude|opposition|polarization|opinion|perception|segregation|prejudice|equality|integration|discrimination" # de-activate for now
#pat_equal = "segregat|equality|integration|discriminat|prejudice"
# How many publications have keywords
keyword_denomi <- length(unique(keyword$id))
ids_w_key <- unique(keyword$id)
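# Sanity check (toy strings, not from the data): pat_income should match the first
# three entries but not the last, i.e., TRUE TRUE TRUE FALSE.
income_check <- grepl(pat_income, c("WAGES", "wage gap", "household income", "brain drain"),
                      ignore.case = TRUE)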

5.1.3 Theme-Frequency Matrix (Keywords)

Add columns to the main data

## Create variable names. Search for all objects in .GlobalEnv whose names start with pat_
pat <- grep("^pat_", names(.GlobalEnv), value = TRUE)


## get the content of each object into a list
patterns_l <- do.call("list", mget(pat))

## convert the list to a data frame
pattern_df <- data.frame(do.call(rbind, patterns_l))
dim(pattern_df); colnames(pattern_df) <- "patterns"
## [1] 5 1
pattern_df$theme <- pat
# sort the themes by alphabetical order
pattern_df <- pattern_df[order(pattern_df$theme),] 
rownames(pattern_df) <- NULL
# extract variable names AFTER the themes were sorted
varnames <- gsub(pattern = "pat_", replacement = "", pattern_df$theme); varnames
## [1] "education" "employ"    "income"    "policy"    "politics"
## experiment with new variable names
newnames <- varnames
for (i in seq_along(varnames)) {
  if(newnames[i] == "demo") {
    newnames[i] <- "demographic_characteristics"
  } else if (newnames[i] == "edu") {
    newnames[i] <- "education"
  } else if (newnames[i] == "employ") {
    newnames[i] <- "employment"
  } else if (newnames[i] == "brain") {
    newnames[i] <- "brain_drain"
  }
}
varnames <- newnames

## initiate an empty data frame
keyword_bi <- data.frame(matrix(ncol = length(varnames), nrow = nrow(dat)))
colnames(keyword_bi) <- varnames

## match 0s and 1s of the patterns in the keywords.
for (i in 1:ncol(keyword_bi)) {
  keyword_bi[,i] <- ifelse(grepl(pattern = pattern_df$patterns[i], dat$keywords, ignore.case = T), 1, 0)
}

## column sum of keywords
colSums(keyword_bi)
##  education employment     income     policy   politics 
##        343        954        422        344        137
## Bind the colSums freq and % into one dataframe
keyword_colsum <- data.frame(cbind(
  keyword = varnames,
  Freq = colSums(keyword_bi),
  pct = paste(round(100*colSums(keyword_bi)/keyword_denomi,2), "%", sep = "")
))
## Add the patterns to the data frame of the keyword/theme list
keyword_colsum$pattern <- pattern_df$patterns
keyword_colsum$Freq <- as.numeric(keyword_colsum$Freq) # cbind() coerced Freq to character; convert before ordering
keyword_colsum <- keyword_colsum[rev(order(keyword_colsum$Freq)),]
rownames(keyword_colsum) <- 1:nrow(keyword_colsum)
keyword_colsum$keyword <- trimws(keyword_colsum$keyword)
datatable(keyword_colsum, caption = "Keyword, Frequency, and Search Patterns")
## Number of publications with at least one of the designated categories
length(which(rowSums(keyword_bi)>0))
## [1] 1505
## % among all publications with keywords
length(which(rowSums(keyword_bi)>0)) / keyword_denomi
## [1] 0.6190868

Print the descending list of keywords with frequency and %.

After categorizing the keywords, the following 5 themes emerged.

They are employment (954, 39.24%), income (422, 17.36%), policy (344, 14.15%), education (343, 14.11%), politics (137, 5.64%). These themes cover 1505 (61.91%) of the 2431 publications with keywords.

paste(paste(keyword_colsum$keyword, " (", keyword_colsum$Freq,", ", keyword_colsum$pct, ")", sep = ""), collapse = ", ")
## [1] "employment (954, 39.24%), income (422, 17.36%), policy (344, 14.15%), education (343, 14.11%), politics (137, 5.64%)"

5.1.4 Theme-Frequency Matrix (Title & Abstract)

# use the same method to construct a binary matrix that queries the main text (Title + Abstract)
text_bi <- data.frame(matrix(ncol = length(varnames), nrow = nrow(dat)))
colnames(text_bi) <- varnames
## match 0s and 1s of the patterns in the titles and abstracts.
for (i in 1:ncol(text_bi)) {
  text_bi[,i] <- ifelse(grepl(pattern = pattern_df$patterns[i], dat$text, ignore.case = T), 1, 0)
}

## column sums of the text matches
colSums(text_bi)
##  education employment     income     policy   politics 
##        698       1659        667        892        225
## Bind the colSums freq and % into one data frame
text_colsum <- data.frame(cbind(
  keyword = varnames,
  Freq = colSums(text_bi), # cbind() will coerce this to character
  pct = paste(round(100*colSums(text_bi)/nrow(dat),2), "%", sep = "")
))
text_colsum$Freq <- as.numeric(text_colsum$Freq) # cbind() coerces all columns to character, so convert Freq back
text_colsum <- text_colsum[rev(order(text_colsum$Freq)),]
rownames(text_colsum) <- 1:nrow(text_colsum)
datatable(text_colsum, caption = "Patterns matches in titles and abstracts")

5.1.5 Theme Coverage in Title & Abstract

## Number of publications with at least one of the designated categories
length(which(rowSums(text_bi)>0))
## [1] 2263
## % among all publications
length(which(rowSums(text_bi)>0)) / nrow(dat)
## [1] 0.8647306

I then tested the performance of the themes extracted from the keywords by applying them to the article titles and abstracts. The keyword-based themes turned out to cover 86.47% of all the publications. This is the distribution of the themes when matching them in the titles and abstracts: employment (1659, 63.39%), policy (892, 34.08%), education (698, 26.67%), income (667, 25.49%), politics (225, 8.6%).

For example, the text-processing example shown earlier now has its matched themes. Under this method, the original text body is expressed as employment, policy, politics.

paste(dat$`Article Title`[100], dat$Abstract[100], sep = ' ')
## [1] "New migrations in the Asia-Pacific region: a force for social and political change A rapid increase in international migration is a central aspect of the social transformations currently taking place in the Asia-Pacific region. Population movements take many forms, including permanent migration, temporary labour migration, mobility of highly skilled personnel, refugee movements and family reunion. Destinations include North America, the Gulf oil states and - increasingly - the fast-growing 'tiger economies' of Asia. Much of the migration is undocumented and a growing proportion of the migrants are women. So far, researchers and policy-makers have concentrated on short-term economic and regulatory aspects. But migration is likely to be a major factor bringing about social and political change in the region. The social networks which develop as part of the migratory process often make official migration control policies difficult to implement. Unplanned settlement is taking place, with important consequences for both sending and receiving societies. Scholars from a number of countries in the region have therefore established an Asia Pacific Migration Research Network to study these issues, to raise public awareness and to provide advice to policymakers. The article describes the aims and development of this Network, which is part of the UNESCO Management of Social Transformations Programme."
text_bi[100,][,colSums(text_bi[100,])>0]
##     employment policy politics
## 100          1      1        1
names(text_bi[100,][,colSums(text_bi[100,])>0])
## [1] "employment" "policy"     "politics"

5.2 Theme Validation (tfidf)

See the term frequencies in the main text, including bi-grams.

Two purposes: (1) to capture new themes, and (2) to include more patterns into the themes. E.g., educational, educate, academic

tf_df_list <- vector(mode = "list")
for (i in dat$id) {
  tokens <- str_tokenize(dat$text[dat$id == i], split = " ") # tokenize once per document
  tf_df_list[[i]] <- data.frame(table(c(
    # temporarily exclude single words
    #tokens,
    bi_gram(tokens),
    tri_gram(tokens)
  )))
  tf_df_list[[i]]$id <- i
}
paste("The following step takes a long time to run. It begins at", Sys.time())
## [1] "The following step takes a long time to run. It begins at 2023-08-30 23:00:01"
tf_df <- data.frame(do.call(rbind, tf_df_list))
tf_df$Var1 <- as.character(tf_df$Var1)
rownames(tf_df) <- NULL
paste("The term-frequency data frame conversion finished at", Sys.time(), "/ The term-frequency data has", format(dim(tf_df)[1], big.mark = ","), "rows.")
## [1] "The term-frequency data frame conversion finished at 2023-08-30 23:01:26 / The term-frequency data has 563,932 rows."
# Next, remove the terms that appeared only a few times across all documents.
tf_df_table <- data.frame(table(
  tf_df$Var1
))
tf_df_table$Var1 <- as.character(tf_df_table$Var1)

paste("Across all documents, there are", format(length(tf_df_table$Var1), big.mark = ","), "unique terms (including the customized range n-grams). The frequency in this table means the number of documents that contain the pattern.")
## [1] "Across all documents, there are 453,868 unique terms (including the customized range n-grams). The frequency in this table means the number of documents that contain the pattern."
## Retain the terms that appeared in many documents.
thres_dfi = length(dat$id)*0.005
paste("Keep the terms that are common enough. E.g., keep the terms that appear in more than 0.5% of the documents:", thres_dfi)
## [1] "Keep the terms that are common enough. E.g., keep the terms that appear in more than 0.5% of the documents: 13.085"
tf_df_table <- tf_df_table[tf_df_table$Freq > thres_dfi,]
paste("After removing the less frequent terms,", format(nrow(tf_df_table),big.mark = ","), "remained.")
## [1] "After removing the less frequent terms, 1,034 remained."
print("Continue filtering this table by removing the terms that were already covered in the themes.")
## [1] "Continue filtering this table by removing the terms that were already covered in the themes."
## JOIN ALL PATTERNS FROM ALL THEMES
tokenized_patterns <- unlist(strsplit(paste(paste(pattern_df$patterns),collapse = "|"), split = "|", fixed = T))
## notes on 08/12/2023, re-consider the removal of these words. Maybe I should keep the bigrams
joined_patterns <- paste(tokenized_patterns, collapse = "|")
# remove the string as long as it contains the theme keyword, bi-grams and tri-grams will be removed if they contain the theme keyword
words_remove <- as.character(tf_df_table$Var1[grepl(pattern = joined_patterns, tf_df_table$Var1)])
# remove the single-word that equals to the theme keyword only, retaining bi-grams and tri-grams that contain the theme word
#words_remove2 <- as.character(tf_df_table$Var1[tf_df_table$Var1 %in% joined_patterns])

paste("These words were already captured from the existing themes:", paste(words_remove, collapse = ","),". A total of", length(words_remove), "words maybe removed.")
## [1] "These words were already captured from the existing themes: development policy,economic policy,economic political,education level,education skill,education system,education train,educational attainment,educational level,effect wage,employment experience,employment opportunity,employment outcome,employment rate,find job,government policy,high education,high income,high income country,high skill labor,high skill labour,high wage,highly skill labour,immigrant employment,immigrant labor,immigration law,immigration policy,immigration wage,income country,income distribution,income inequality,income tax,increase labor,increase unemployment,increase wage,integration policy,international labor,international labour,international student,job market,job opportunity,job search,labor demand,labor force,labor market,labor market competition,labor market effect,labor market integration,labor market outcome,labor migration,labor mobility,labor supply,labour force,labour market,labour market experience,labour market integration,labour market outcome,labour migrant,labour migration,labour mobility,labour shortage,labour supply,level education,local labor,local labor market,local labour,local labour market,low income,low skill job,low skill labor,low skill labour,low wage,migrant labor,migrant labour,migration labour,migration policy,minimum wage,policy can,policy change,policy debate,policy development,policy implication,policy make,policy maker,policy may,policy paper,policy practice,policy recommendation,policy reform,policy response,political economic,political economy,public policy,reduce wage,relative wage,return education,self employ,self employment,skill employee,skill employment,skill immigration policy,skill job,skill labor,skill labour,skill labour migration,skill unskilled labor,skill unskilled wage,skill wage,social political,unemployment rate,unskilled labor,unskilled labour,unskilled wage,unskilled wage inequality,wage differential,wage effect,wage employment,wage gap,wage increase,wage inequality,wage low,wage native,wage premium,wage skill . A total of 124 words maybe removed."
#tf_df_table <- tf_df_table[!tf_df_table$Var1 %in% words_remove, ]
#nrow(tf_df_table)
## Also, remove single characters
tf_df_table <- tf_df_table[nchar(tf_df_table$Var1)>1, ]

## Remove entries containing only numbers

tf_df_table <- tf_df_table %>% filter(is.na(as.numeric(Var1)))
## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `is.na(as.numeric(Var1))`.
## Caused by warning:
## ! NAs introduced by coercion
## Remove topic-specific generic terms

other_removewords <- c("migrant","migration", "immigrant", "immigration", "emigrant", "emigration", "skilled", "skill")
tf_df_table <- tf_df_table[!tf_df_table$Var1 %in% other_removewords, ]
tf_df_table <- tf_df_table[!grepl(pattern = paste(other_removewords, collapse = "|"), tf_df_table$Var1), ]


paste("Now, the term-frequency table has", format(nrow(tf_df_table),big.mark = ","), "terms.")
## [1] "Now, the term-frequency table has 759 terms."
## Shorten the `tf_df` data frame that only includes the remaining terms.
tf_df <- tf_df[tf_df$Var1 %in% tf_df_table$Var1, ]

TFIDF formula with smoothed inverse document frequency:

\[w_{i,j} = tf_{i,j} \times \left(\log \frac{N}{df_{i} + 1} + 1\right)\]

Where,

\(tf_{i,j} =\) the number of occurrences of \(i\) in \(j\)

\(df_{i} =\) the number of documents containing \(i\)

\(N =\) total number of documents
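
As a quick numeric illustration of the weighting, with toy values that are not taken from the data:

tf_ij <- 3   # the term appears 3 times in document j
df_i <- 100  # ... and in 100 documents overall
N <- 2617    # total number of documents
w_ij <- tf_ij * (log(N / (df_i + 1)) + 1) # about 12.76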

# Now merge the following two data frames, `tf_df` and `tf_df_table`
class(tf_df$Var1) = "character"
class(tf_df_table$Var1) = "character"
tf_idf <- merge(tf_df, tf_df_table, by = "Var1", all.x = TRUE)
names(tf_idf) <- c("t","f","id","n_doc")
## Add the weight column that calculates the weight of each term in each document
tf_idf$w <- tf_idf$f*(log(length(unique(tf_idf$id)) / (tf_idf$n_doc + 1)) + 1)

## What are the words with the lowest weight?

paste("The following words were assigned to the lowest weights:", paste(unique(tf_idf$t[tf_idf$w < 1]),collapse = ", "),  "because they appeared in too many documents." )
## [1] "The following words were assigned to the lowest weights:  because they appeared in too many documents."
stripchart(tf_idf$w)

tf_idf <- tf_idf[tf_idf$w > 1, ]
nrow(tf_idf)
## [1] 21962
tf_idf <- tf_idf[!grepl(pattern = 'paper|study|conclusion|article|also|among|wiley|hypothesis|ltd|research|elsevier| much|empirical|copyright|c 20', tf_idf$t),]
nrow(tf_idf)
## [1] 17342
#unique(tf_idf$t)
paste("Having the words with the lowest weights removed, the remaining `tf-idf` table has", format(nrow(tf_idf), big.mark = ","), "rows." )
## [1] "Having the words with the lowest weights removed, the remaining `tf-idf` table has 17,342 rows."
# Sort by id, with the highest weight first within each document
tf_idf <- arrange(tf_idf, id, desc(w))
rownames(tf_idf) <- NULL
# Check the distribution of the weights
quantile(tf_idf$w, c(.5, .75, .8, .9, .95, .99))
##       50%       75%       80%       90%       95%       99% 
##  5.638218  6.084505  6.149044 10.979596 13.038703 24.450846
## rank the weights of terms within each id
tf_idf <- tf_idf %>% group_by(id) %>% mutate(rank = row_number())
## keep a reduced tf_idf with highest ranked terms for each id
quantile(tf_idf$w, .95)
##     95% 
## 13.0387
tf_idf_top <- tf_idf[tf_idf$rank <= 5 | tf_idf$w > quantile(tf_idf$w, .95), ]
nrow(tf_idf_top)
## [1] 11015
paste("The `tf_idf` data frame summarizes the top 3 (at least) highest weighted terms in", format(nrow(tf_idf_top), big.mark = ","), "rows.")
## [1] "The `tf_idf` data frame summarizes the top 3 (at least) highest weighted terms in 11,015 rows."

NOTES on 08/14/2023: many bi-grams are contained in tri-grams. The program needs additional revisions to address this.
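
One possible revision, sketched below under the assumption that nested terms should be de-duplicated within each document. drop_nested_bigrams is a hypothetical helper, not part of the current pipeline; it reuses the word_count function defined earlier.

# Hypothetical helper: within one document's term vector, drop a bi-gram when it
# is a substring of a tri-gram that also appears in that document.
drop_nested_bigrams <- function(terms) {
  tris <- terms[word_count(terms) == 3]
  bis  <- terms[word_count(terms) == 2]
  if (length(bis) == 0 || length(tris) == 0) return(terms)
  nested <- vapply(bis, function(b) any(grepl(b, tris, fixed = TRUE)), logical(1))
  c(terms[word_count(terms) != 2], bis[!nested])
}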

5.3 Theme by Decade

The following is the descending list of keywords by decade. Although the keywords are to be combined into themes, the frequency tables provide useful insights.

5.3.1 Keyword frequency by decade

# t90
print_freq_tables(keyword$keywords[keyword$decade == "t90"])
# t00
print_freq_tables(keyword$keywords[keyword$decade == "t00"])
# t10
print_freq_tables(keyword$keywords[keyword$decade == "t10"])
# t20
print_freq_tables(keyword$keywords[keyword$decade == "t20"])

5.3.2 Evolution of Themes by Decade

To examine the longitudinal patterns, I decided to select the publications in the past three decades, i.e., 1990s, 2000s, and 2010s. There are 2,146 publications that were published between 1991 and 2020, representing 82% of the total publications. Most publications excluded were published between 2021 and 2023—I will discuss them separately.

nrow(dat_9010)
## [1] 2146
nrow(dat_9010) / nrow(dat)
## [1] 0.8200229

To tabulate the theme distribution, either by the keyword matches or by the title/abstract matches, the theme frequency matrix needs to be joined to the main data. I recommend using the theme frequency matrix of the titles/abstracts because the coverage is better. (Also, 165 publications had missing keywords, but only 36 publications had missing abstracts.)

dat_tf <- cbind(dat[, c("Publication Year", "id","decade","Times Cited, WoS Core")], text_bi)
theme_dec <- data.frame(aggregate(dat_tf[, names(dat_tf) %in% varnames], by = list(decade = dat_tf$decade), sum), row.names = 'decade')
datatable(t(theme_dec))

Although the share of political science and public administration publications has been stable, the policy theme moved up into the top three in the 2000s and 2010s, compared to the 1990s.

theme_dec_t <- data.frame(t(theme_dec))
# publication counts by decade
dec_ct <- c(length(id80), length(id90), length(id00), length(id10), length(id20))
names(dec_ct) <- as.character(unique(dat$decade))
paste("The publication count of these decades are:",paste(names(dec_ct), "-", dec_ct, collapse = ", "),sep = " ")
## [1] "The publication count of these decades are: t80 - 16, t90 - 133, t00 - 494, t10 - 1519, t20 - 455"
# this is to create a % data frame of the theme by decade table
theme_dec_t_pct <- data.frame(matrix(ncol = ncol(theme_dec_t), nrow = nrow(theme_dec_t)))
colnames(theme_dec_t_pct) <- colnames(theme_dec_t)
rownames(theme_dec_t_pct) <- rownames(theme_dec_t) # copy the row names from the source table
## fill in proportion values based on publication counts of the theme and the total publications in the decade
for (i in 1:ncol(theme_dec_t)) {
  theme_dec_t_pct[,i] <- theme_dec_t[,i] / dec_ct[i]
}
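## Equivalent without the loop, as a sketch: sweep() divides each column of the
## theme-by-decade counts by the matching decade total.
# theme_dec_t_pct <- sweep(theme_dec_t, 2, dec_ct, "/")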

# print top five themes along with the publication numbers, looping over each decade
for (col in colnames(theme_dec_t)) {
  print(col)
  print(paste(paste(
    rownames(theme_dec_t[rev(order(theme_dec_t[, col])), ])[1:5],
    " (",
    sort(theme_dec_t[, col], decreasing = T)[1:5],
    ", ",
    paste(round(100*sort(theme_dec_t_pct[, col], decreasing = T),2)[1:5], "%", sep = ""), # from a different data frame
    ")",
    sep = ""
  ),
  collapse = ", ")) # the outer paste is to join different text strings
  
}
## [1] "t80"
## [1] "policy (1, 6.25%), income (1, 6.25%), employment (1, 6.25%), education (1, 6.25%), politics (0, 0%)"
## [1] "t90"
## [1] "employment (103, 77.44%), income (42, 31.58%), policy (41, 30.83%), education (21, 15.79%), politics (20, 15.04%)"
## [1] "t00"
## [1] "employment (328, 66.4%), policy (186, 37.65%), income (147, 29.76%), education (126, 25.51%), politics (41, 8.3%)"
## [1] "t10"
## [1] "employment (940, 61.88%), policy (496, 32.65%), education (426, 28.04%), income (377, 24.82%), politics (125, 8.23%)"
## [1] "t20"
## [1] "employment (287, 63.08%), policy (168, 36.92%), education (124, 27.25%), income (100, 21.98%), politics (39, 8.57%)"

datatable(round(theme_dec_t_pct,4), caption = "Theme by Decade by Proportion")

Modify the theme-by-decade data frame: move the row names into a column.

theme_dec<- cbind(
  decade = rownames(theme_dec),
  theme_dec,
  row.names = NULL
)

## theme-by-decade proportions, with the themes moved into a column
theme_dec_t_pct <- cbind(
  theme = rownames(theme_dec_t_pct),
  theme_dec_t_pct,
  row.names = NULL
)

More detail: theme by publication year

theme_yr <- data.frame(aggregate(dat_tf[, names(dat_tf) %in% varnames], by = list(year = dat_tf$`Publication Year`), sum))
datatable(theme_yr)

5.4 Theme by Discipline

head(key_cat)
##      id       keywords decade.x WoS Categories decade.y
## 1 00_10       MOBILITY      t90      Economics      t90
## 2 00_10       TAXATION      t90      Economics      t90
## 3 00_10 REDISTRIBUTION      t90      Economics      t90
## 4 00_10    EQUILIBRIUM      t90      Economics      t90
## 5 00_10        RETURNS      t90      Economics      t90
## 6 00_10         EQUITY      t90      Economics      t90

5.4.1 Top N Disciplines

top_cat <- as.character(cat_freq$Var1[1:20])
# Top N disciplines with the most publications
paste(top_cat, collapse = ", ")
## [1] "Economics, Demography, Geography, Sociology, Environmental Studies, Ethnic Studies, Management, Industrial Relations & Labor, Regional & Urban Planning, Social Sciences, Interdisciplinary, Political Science, Urban Studies, Development Studies, Area Studies, Business, Finance, Public, Environmental & Occupational Health, Business, Public Administration, Education & Educational Research, International Relations"

5.4.2 Keyword Frequency by Discipline

key_cat_allyr_list <- vector(mode = "list")
for(cat in top_cat) {
 key_cat_allyr_list[[cat]] <- freq_crosslist(key_cat$keywords[key_cat$`WoS Categories` == cat], denominator = nrow(dat))
}
key_cat_allyr <- data.frame(do.call(rbind, key_cat_allyr_list))
#key_cat_allyr$Discipline <- rownames(key_cat_allyr)
key_cat_allyr$Discipline <- sub("\\.[0-9]+$", "", rownames(key_cat_allyr))
key_cat_allyr$Rank <- str_extract(rownames(key_cat_allyr), "[[:digit:]]+")
key_cat_allyr$Rank <- as.numeric(key_cat_allyr$Rank)
rownames(key_cat_allyr) <- NULL

datatable(key_cat_allyr[key_cat_allyr$Rank <=50 & key_cat_allyr$Frequency > 2,])

PLSC and PADM Keyword Frequency

print_fq_crlt(key_cat$keywords[key_cat$`WoS Categories` %in% c("Public Administration", "Political Science")], denominator = length(id_plsc_padm), header = "Keyword Frequency Table in Political Science and Public Administration")

5.4.3 Case IDs by Themes

Create a list of theme-based ids, along with different subsets of theme_id, because they will be needed for the decade tabulation.

theme_id <- vector(mode = "list", length = length(varnames))
names(theme_id) <- varnames
# Fill in the ids for each theme, loop over varnames
for (var in varnames) {
  theme_id[[var]] <- dat_tf$id[dat_tf[, var] > 0] # double wrapping for list objects
}
## Create a data frame with cells filled with the ids of specific theme and decade
theme_dec_id <- data.frame(
  matrix(ncol = length(varnames), nrow = length(unique(dat$decade))),
  row.names = unique(dat$decade)
)
colnames(theme_dec_id) <- varnames

for (decade in rownames(theme_dec_id)) {
  for (var in varnames) {
    theme_dec_id[decade, var] <- paste(
        dat_tf$id[dat_tf$decade == decade & dat_tf[, var] > 0], collapse = ",") # The length of each cell is 1, has to be tokenized for indexing
  }
}
theme_dec_id <- data.frame(cbind(
  decade = rownames(theme_dec_id),
  theme_dec_id,
  row.names = NULL
))
print("To index individual publication IDs, use the `str_tokenize` function, as defined at the beginning of this document.")
## [1] "To index individual publication IDs, use the `str_tokenize` function, as defined at the beginning of this document."

How about a theme-by-year id data frame?

theme_yr_id <- data.frame(
  matrix(ncol = length(varnames), nrow = length(unique(dat$`Publication Year`))),
  row.names = unique(dat$`Publication Year`)
)
colnames(theme_yr_id) <- varnames
paste("The blank `theme by year` data frame of ids has", dim(theme_yr_id)[1], "rows, equals to the number of publication years; and", dim(theme_yr_id)[2], "columns, equals to the number of themes. Next, fill in the blank cells with publication ids.")
## [1] "The blank `theme by year` data frame of ids has 41 rows, equals to the number of publication years; and 5 columns, equals to the number of themes. Next, fill in the blank cells with publication ids."
## Fill in every cell by looping over the rows and the columns
for (year in rownames(theme_yr_id)) {
  for (var in varnames) {
    theme_yr_id[year, var] <- paste(
        dat_tf$id[dat_tf$`Publication Year`== year & dat_tf[, var] > 0], collapse = ",")
  }
}
theme_yr_id <- data.frame(cbind(
  year = rownames(theme_yr_id), # the row names here are publication years, not decades
  theme_yr_id,
  row.names = NULL
))
print("The `theme by year` data frame of ids has been filled. It's not for tabulation, but for indexing (updated on May 6, 2023).")
## [1] "The `theme by year` data frame of ids has been filled. It's not for tabulation, but for indexing (updated on May 6, 2023)."
datatable(theme_yr_id[10:12, 1:5], caption = "An illustration of the `theme by year` id table")

5.4.4 Theme by Discipline, All Years

Theme-discipline tabulation for all years

theme_cat_list <- vector(mode = "list")
for (theme in names(theme_id)) {
  theme_cat_list[[theme]] <- freq_crosslist(cat_id$`WoS Categories`[cat_id$id %in% theme_id[[theme]]], denominator = length(theme_id[[theme]]))
}
## The following sequence should not be changed
theme_cat_allyr <- data.frame(do.call(rbind, theme_cat_list))
theme_cat_allyr$Theme <- rownames(theme_cat_allyr)
theme_cat_allyr$Rank <- str_extract(theme_cat_allyr$Theme, "[[:digit:]]+")
theme_cat_allyr$Rank <- as.numeric(theme_cat_allyr$Rank) ## default as character after extraction
theme_cat_allyr$Theme <- sub(".[0-9]+", "", rownames(theme_cat_allyr))
rownames(theme_cat_allyr) <- NULL

datatable(theme_cat_allyr[theme_cat_allyr$Frequency>1 & theme_cat_allyr$Rank <= 20,], caption = "Top Disciplines for Each Theme")

\(\color{red}{\text{How to read the above table?}}\)

## [1] "Interpretation of this table: The most productive discipline in the theme of EDUCATION is ECONOMICS"

5.4.5 Theme by Discipline by Decade

# This needs a different approach from the all-year tabulation. We need to go one level deeper and pull the subset of theme ids that fall within each decade
# Source: cat_id, parameters: decade and theme

## ADDRESS THE FOLLOWING ERROR (May 7, 2023)
## Error in fq_crlt$Freq : $ operator is invalid for atomic vectors
## If `word_count((theme_dec_id[, theme])[theme_dec_id$decade == dec],split = ",")` == 1, then create a new data frame manually, with column names of "Term","Frequency","Share". If == 0, next

paste("The following are included in the loop:", paste(as.character(unique(cat_id$decade)), collapse = ","), ". It's safer to wrap a factor varible as character.")
## [1] "The following are included in the loop: t80,t90,t00,t10,t20 . It's safer to wrap a factor varible as character."
## Initiate a blank list
theme_dec_cat_list <- vector(mode = "list")

print("LOOP BEGINS:")
## [1] "LOOP BEGINS:"
for (dec in as.character(unique(cat_id$decade))) {
  for (theme in varnames) {
    theme_dec_cat_list[[dec]][[theme]] <-
      freq_crosslist(cat_id$`WoS Categories`[cat_id$id %in% str_tokenize(text = (theme_dec_id[, theme])[theme_dec_id$decade == dec],
                                                                         split = ",")],
                     denominator = word_count((theme_dec_id[, theme])[theme_dec_id$decade == dec],
                                              split = ","))
    print(
      paste(
        "Loop over",
        toupper(theme),
        "and",
        dec,
        "has completed. The output has",
        dim(theme_dec_cat_list[[dec]][[theme]])[1],
        "row(s)."
      ) # End of paste
    ) # End of print
  } # End of theme (inner) loop
} # End of decade(outer) loop
## [1] "Loop over EDUCATION and t80 has completed. The output has 1 row(s)."
## [1] "Loop over EMPLOYMENT and t80 has completed. The output has 1 row(s)."
## [1] "Loop over INCOME and t80 has completed. The output has 3 row(s)."
## [1] "Loop over POLICY and t80 has completed. The output has 1 row(s)."
## [1] "The expected multi-element vector has no elements, a place holder row with `Frequency = 0` is introduced."
## [1] "Loop over POLITICS and t80 has completed. The output has 1 row(s)."
## [1] "Loop over EDUCATION and t90 has completed. The output has 16 row(s)."
## [1] "Loop over EMPLOYMENT and t90 has completed. The output has 28 row(s)."
## [1] "Loop over INCOME and t90 has completed. The output has 16 row(s)."
## [1] "Loop over POLICY and t90 has completed. The output has 20 row(s)."
## [1] "Loop over POLITICS and t90 has completed. The output has 15 row(s)."
## [1] "Loop over EDUCATION and t00 has completed. The output has 34 row(s)."
## [1] "Loop over EMPLOYMENT and t00 has completed. The output has 48 row(s)."
## [1] "Loop over INCOME and t00 has completed. The output has 34 row(s)."
## [1] "Loop over POLICY and t00 has completed. The output has 42 row(s)."
## [1] "Loop over POLITICS and t00 has completed. The output has 22 row(s)."
## [1] "Loop over EDUCATION and t10 has completed. The output has 67 row(s)."
## [1] "Loop over EMPLOYMENT and t10 has completed. The output has 76 row(s)."
## [1] "Loop over INCOME and t10 has completed. The output has 44 row(s)."
## [1] "Loop over POLICY and t10 has completed. The output has 66 row(s)."
## [1] "Loop over POLITICS and t10 has completed. The output has 42 row(s)."
## [1] "Loop over EDUCATION and t20 has completed. The output has 43 row(s)."
## [1] "Loop over EMPLOYMENT and t20 has completed. The output has 52 row(s)."
## [1] "Loop over INCOME and t20 has completed. The output has 34 row(s)."
## [1] "Loop over POLICY and t20 has completed. The output has 50 row(s)."
## [1] "Loop over POLITICS and t20 has completed. The output has 26 row(s)."
print("After repeated testing, the `freq_crosslist` function, expected to tabulate a vector of multiple elements, can now handle atom values or `NA` values. If the loop meets an atom value, Frequency = 1 and Share = 1, the later introduced `Rank` variable will also be set to 1.  If the loop meets an `NA` cell, a `Place_holder` row will be introduced with Frequency = 0. Just remove the rows with Frequency = 0 to remove the original `NA`s.")
## [1] "After repeated testing, the `freq_crosslist` function, expected to tabulate a vector of multiple elements, can now handle atom values or `NA` values. If the loop meets an atom value, Frequency = 1 and Share = 1, the later introduced `Rank` variable will also be set to 1.  If the loop meets an `NA` cell, a `Place_holder` row will be introduced with Frequency = 0. Just remove the rows with Frequency = 0 to remove the original `NA`s."
## Add Theme and Rank from the rownames; to combine all decades, create an interim list that stores one data frame per decade (finished on May 7, 2023)
theme_dec_cat_list_df_interim <- vector(mode = "list")

for (dec in as.character(unique(cat_id$decade))) {
  theme_dec_cat_list_df_interim[[dec]] <- data.frame(cbind(do.call(rbind, theme_dec_cat_list[[dec]])), decade = dec)
  # Make sure all frequencies are larger than 0, this is to remove the place holder rows generated from variables without any elements.
  theme_dec_cat_list_df_interim[[dec]] <- (theme_dec_cat_list_df_interim[[dec]])[theme_dec_cat_list_df_interim[[dec]]$Frequency > 0, ]
  # Create a Theme column 
  theme_dec_cat_list_df_interim[[dec]]$Theme <- rownames(theme_dec_cat_list_df_interim[[dec]]) # not final
  theme_dec_cat_list_df_interim[[dec]]$Rank <- str_extract(theme_dec_cat_list_df_interim[[dec]]$Theme, "[[:digit:]]+") # Extract digits from theme
  ## If there is only one row, Rank introduces NA, modify the NAs to 1
  theme_dec_cat_list_df_interim[[dec]]$Rank[is.na(theme_dec_cat_list_df_interim[[dec]]$Rank)] <- 1
  theme_dec_cat_list_df_interim[[dec]]$Rank <- as.numeric(theme_dec_cat_list_df_interim[[dec]]$Rank)
  theme_dec_cat_list_df_interim[[dec]]$Theme <- sub(".[0-9]+", "", theme_dec_cat_list_df_interim[[dec]]$Theme)
  rownames(theme_dec_cat_list_df_interim[[dec]]) <- NULL
}
## Combine the interim list into a single data frame
theme_dec_cat_df <- data.frame(do.call(rbind, theme_dec_cat_list_df_interim), row.names = NULL)
datatable(theme_dec_cat_df[theme_dec_cat_df$Rank <=10, ], caption = "Top 10 Disciplines of Each Theme by Decade")
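With `theme_dec_cat_df` assembled, the single most productive discipline per theme and decade can be pulled out with dplyr. A minimal sketch (the `top1_theme_dec` name is illustrative):

top1_theme_dec <- theme_dec_cat_df %>%
  group_by(decade, Theme) %>%
  slice_max(Frequency, n = 1, with_ties = FALSE) %>% # keep the top discipline per theme-decade
  ungroup() %>%
  select(decade, Theme, Term, Frequency, Share)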

5.4.6 Policy

### 1990s
top10policy90s <- tolower(theme_dec_cat_df$Term[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t90" & theme_dec_cat_df$Rank <= 10])
top10policy90sfreq <- paste(theme_dec_cat_df$Frequency[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t90" & theme_dec_cat_df$Rank <= 10], sep = "")

top10policy90spct <- paste(round(100*theme_dec_cat_df$Share[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t90" & theme_dec_cat_df$Rank <= 10],2), "%", sep = "")
#paste(top10policy90s, " (", top10policy90spct, ")", sep = "")
paste("Top 10 disciplines that published in the policy theme in the 1990s were", paste(paste(top10policy90s, " (",top10policy90sfreq, ", ", top10policy90spct, ")", sep = ""), collapse = ", "))
## [1] "Top 10 disciplines that published in the policy theme in the 1990s were economics (13, 31.71%), demography (11, 26.83%), geography (6, 14.63%), environmental studies (6, 14.63%), social sciences, interdisciplinary (4, 9.76%), urban studies (3, 7.32%), sociology (3, 7.32%), social issues (3, 7.32%), political science (3, 7.32%), regional & urban planning (2, 4.88%)"
### 2000s
top10policy00s <- tolower(theme_dec_cat_df$Term[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t00" & theme_dec_cat_df$Rank <= 10])
top10policy00sfreq <- paste(theme_dec_cat_df$Frequency[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t00" & theme_dec_cat_df$Rank <= 10], sep = "")

top10policy00spct <- paste(round(100*theme_dec_cat_df$Share[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t00" & theme_dec_cat_df$Rank <= 10],2), "%", sep = "")
#paste(top10policy90s, " (", top10policy90spct, ")", sep = "")
paste("Top 10 disciplines that published in the policy theme in the 2000s were", paste(paste(top10policy00s, " (",top10policy00sfreq, ", ", top10policy00spct, ")", sep = ""), collapse = ", "))
## [1] "Top 10 disciplines that published in the policy theme in the 2000s were economics (66, 35.48%), demography (38, 20.43%), geography (20, 10.75%), development studies (15, 8.06%), sociology (13, 6.99%), social sciences, interdisciplinary (12, 6.45%), environmental studies (11, 5.91%), area studies (10, 5.38%), political science (9, 4.84%), regional & urban planning (8, 4.3%)"
### 2010s


top10policy10s <- tolower(theme_dec_cat_df$Term[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t10" & theme_dec_cat_df$Rank <= 10])
top10policy10sfreq <- paste(theme_dec_cat_df$Frequency[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t10" & theme_dec_cat_df$Rank <= 10], sep = "")

top10policy10spct <- paste(round(100*theme_dec_cat_df$Share[theme_dec_cat_df$Theme == "policy" & theme_dec_cat_df$decade == "t10" & theme_dec_cat_df$Rank <= 10],2), "%", sep = "")
#paste(top10policy90s, " (", top10policy90spct, ")", sep = "")
paste("Top 10 disciplines that published in the policy theme in the 2010s were", paste(paste(top10policy10s, " (",top10policy10sfreq, ", ", top10policy10spct, ")", sep = ""), collapse = ", "))
## [1] "Top 10 disciplines that published in the policy theme in the 2010s were economics (141, 28.43%), demography (115, 23.19%), geography (47, 9.48%), industrial relations & labor (32, 6.45%), ethnic studies (29, 5.85%), sociology (28, 5.65%), political science (27, 5.44%), social sciences, interdisciplinary (26, 5.24%), management (25, 5.04%), environmental studies (20, 4.03%)"
# If a loop is needed, create a list for the following parameters
#paste("Top", n, "disciplines that published in the", theme, "in the", dec, paste( paste(cat, " (", freq, ", ", pct, ")", sep = ""), collapse = ", " ))

6 Data Quick View

names(dat)
##  [1] "Author Full Names"     "Article Title"         "Source Title"         
##  [4] "Abstract"              "Times Cited, WoS Core" "Publication Year"     
##  [7] "WoS Categories"        "id"                    "id2"                  
## [10] "keywords"              "text"                  "decade"
datatable(dat[,c("id", "Author Full Names", "Article Title", "Source Title", "WoS Categories")])

7 Follow-up Analysis

7.1 Network-Friendly Data

text_bi_copy <- text_bi
dim(text_bi_copy)
## [1] 2617    5
rownames(text_bi_copy) <- dat$id
write.csv(text_bi_copy,'migs_two_mode.csv', row.names = T)
text_bi_copy$decade <- dat$decade
## 1990s, 2-mode
two_mode_90 <- text_bi_copy[text_bi_copy$decade == 't90', !names(text_bi_copy) %in% c('decade')]
## 2000s
two_mode_00 <- text_bi_copy[text_bi_copy$decade == 't00', !names(text_bi_copy) %in% c('decade')]
## 2010s
two_mode_10 <- text_bi_copy[text_bi_copy$decade == 't10', !names(text_bi_copy) %in% c('decade')]

### output
#write.csv(two_mode_90, 'two_mode_90.csv')
#write.csv(two_mode_00, 'two_mode_00.csv')
#write.csv(two_mode_10, 'two_mode_10.csv')
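The three per-decade subsets (and the commented exports) could also be produced in one loop. A minimal sketch following the file-name convention above:

two_mode_list <- list()
for (dec in c("t90", "t00", "t10")) {
  two_mode_list[[dec]] <- text_bi_copy[text_bi_copy$decade == dec,
                                       !names(text_bi_copy) %in% c("decade")]
  #write.csv(two_mode_list[[dec]], paste0("two_mode_", sub("t", "", dec), ".csv"))
}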

7.2 Most-Cited Themes

Tabulate the top 10% most-cited articles in each decade, regardless of theme.

hi_cite <- data.frame(cbind(
  dat[, c("id", "decade","Times Cited, WoS Core")], text_bi
))
names(hi_cite)[grepl(pattern = 'Cited', names(hi_cite))] <- 'cited'
# create a copy of hi_cite, named df_cite, because hi_cite will be sliced to top 10%
df_cite <- hi_cite
# top 10% of each decade
# create a list of dataframes for each decade
hi_cite_decade <- list()
top_10pct_cutoff <- c()
for (i in seq_along(unique(hi_cite$decade))) {
  hi_cite_decade[[i]] <- hi_cite[hi_cite$decade == unique(hi_cite$decade)[i],]
  # sort citations in descending order
  hi_cite_decade[[i]] <- hi_cite_decade[[i]][rev(order(hi_cite_decade[[i]]$cited)),]
  # calculate the top 10% cutoff (may be fractional)
  top_10pct_cutoff[i] <- nrow(hi_cite_decade[[i]])*0.1
  # keep only the rows belonging to the top 10% most cited;
  # the fractional cutoff is truncated when used inside `1:`
  hi_cite_decade[[i]] <- (hi_cite_decade[[i]])[1:top_10pct_cutoff[i], ]
  print(unique(hi_cite$decade)[i])
  print("How many articles belong to the top 10% most cited in this decade?")
  print(nrow(hi_cite_decade[[i]]))
}
## [1] t80
## Levels: t80 < t90 < t00 < t10 < t20
## [1] "How many articles belong to the top 10% most cited in this decade?"
## [1] 1
## [1] t90
## Levels: t80 < t90 < t00 < t10 < t20
## [1] "How many articles belong to the top 10% most cited in this decade?"
## [1] 13
## [1] t00
## Levels: t80 < t90 < t00 < t10 < t20
## [1] "How many articles belong to the top 10% most cited in this decade?"
## [1] 49
## [1] t10
## Levels: t80 < t90 < t00 < t10 < t20
## [1] "How many articles belong to the top 10% most cited in this decade?"
## [1] 151
## [1] t20
## Levels: t80 < t90 < t00 < t10 < t20
## [1] "How many articles belong to the top 10% most cited in this decade?"
## [1] 45
paste("The top 10% most cited articles cut-off are",paste(top_10pct_cutoff, collapse = ', '))
## [1] "The top 10% most cited articles cut-off are 1.6, 13.3, 49.4, 151.9, 45.5"
# Get the total citations for all publications for each decade
tot_cite_decade <- df_cite %>%
  group_by(decade) %>%
  summarise(total_cite = sum(cited))
# top_10pct_cite_df
top_10pct_cite_df <- do.call(what = "rbind", lapply(hi_cite_decade, as.data.frame))
print("Top 10% most cited micro data:")
## [1] "Top 10% most cited micro data:"
top_10pct_cite_df
##          id decade cited education employment income policy politics
## 7      86_1    t80   135         0          0      0      0        0
## 42     94_8    t90   263         0          1      1      0        0
## 18     91_2    t90   222         0          0      1      0        0
## 65    96_14    t90   216         0          1      0      0        0
## 50     95_7    t90   199         0          1      0      0        0
## 116    00_1    t90   185         0          1      0      1        1
## 128   00_13    t90   180         0          0      0      1        0
## 64    96_13    t90   152         0          1      0      0        1
## 52     96_1    t90   145         0          1      0      0        0
## 75    97_10    t90   109         1          1      1      0        0
## 137   00_22    t90    93         0          1      0      0        0
## 145   00_30    t90    91         0          1      0      0        0
## 95    98_15    t90    83         0          1      0      0        0
## 54     96_3    t90    83         0          1      0      1        0
## 158    01_9    t00   831         0          1      1      0        0
## 313   06_18    t00   620         0          1      0      0        0
## 166   01_17    t00   589         0          1      1      1        0
## 526   09_68    t00   537         0          1      0      0        0
## 563   10_29    t00   525         0          1      0      0        0
## 346   07_11    t00   524         1          1      1      0        0
## 219   03_27    t00   482         0          1      0      0        0
## 215   03_23    t00   423         0          1      1      1        0
## 304    06_9    t00   394         0          1      0      0        0
## 426   08_34    t00   376         1          0      0      0        0
## 627   10_93    t00   365         0          0      0      0        0
## 267    05_9    t00   360         1          1      0      0        0
## 268   05_10    t00   351         0          0      1      0        0
## 570   10_36    t00   347         0          0      0      0        0
## 185   02_12    t00   319         0          0      0      0        0
## 279   05_21    t00   313         0          1      0      0        0
## 153    01_4    t00   293         1          1      0      0        1
## 536    10_2    t00   265         0          1      0      1        0
## 217   03_25    t00   256         1          1      0      0        0
## 165   01_16    t00   226         0          0      0      0        0
## 171   01_22    t00   225         0          0      0      0        0
## 169   01_20    t00   221         0          1      0      0        0
## 181    02_8    t00   218         0          1      0      1        0
## 295   05_37    t00   216         0          0      0      1        0
## 477   09_19    t00   212         0          1      1      0        0
## 408   08_16    t00   192         0          1      1      0        0
## 271   05_13    t00   183         0          1      0      0        0
## 521   09_63    t00   175         0          0      1      1        0
## 159   01_10    t00   175         1          1      1      0        0
## 363   07_28    t00   170         0          1      0      1        0
## 229    04_8    t00   166         0          1      0      1        0
## 296    06_1    t00   163         0          0      0      0        0
## 456   08_64    t00   162         0          1      0      1        0
## 440   08_48    t00   152         0          0      0      1        0
## 325   06_30    t00   151         0          1      0      0        0
## 332   06_37    t00   150         0          0      0      0        0
## 571   10_37    t00   149         0          0      0      1        0
## 373   07_38    t00   148         0          1      0      0        0
## 314   06_19    t00   146         0          1      0      1        0
## 285   05_27    t00   146         0          1      0      0        0
## 623   10_89    t00   144         1          0      0      0        0
## 210   03_18    t00   143         0          0      1      0        0
## 637  10_103    t00   140         0          0      0      0        0
## 351   07_16    t00   138         1          0      0      0        1
## 293   05_35    t00   138         0          1      0      0        0
## 466    09_8    t00   135         1          0      0      0        0
## 454   08_62    t00   135         0          1      0      1        0
## 287   05_29    t00   130         0          1      0      0        0
## 224    04_3    t00   127         0          1      0      1        0
## 788   12_29    t10   400         1          0      0      1        0
## 684   11_41    t10   377         0          1      1      0        0
## 1117 14_124    t10   279         0          1      0      1        0
## 727   11_84    t10   259         1          1      0      0        0
## 959   13_96    t10   233         0          0      1      0        0
## 1130 14_137    t10   208         0          1      0      0        0
## 885   13_22    t10   194         0          1      0      0        0
## 1318  16_38    t10   176         0          0      0      0        0
## 1029  14_36    t10   158         0          1      0      0        0
## 1044  14_51    t10   154         0          1      1      0        0
## 802   12_43    t10   153         0          0      0      1        1
## 683   11_40    t10   153         0          0      0      0        0
## 1575 17_148    t10   148         0          1      0      0        0
## 687   11_44    t10   148         0          1      1      0        0
## 699   11_56    t10   145         0          0      1      0        0
## 797   12_38    t10   141         1          1      1      1        0
## 1171  15_39    t10   139         0          1      1      0        0
## 984  13_121    t10   136         1          0      0      0        0
## 644    11_1    t10   134         0          1      1      0        0
## 1499  17_72    t10   129         0          1      0      1        1
## 1303  16_23    t10   128         0          1      1      1        0
## 990  13_127    t10   128         0          0      0      1        0
## 1951 19_186    t10   127         0          1      0      1        0
## 1638  18_40    t10   122         1          0      0      1        0
## 748  11_105    t10   121         1          1      1      0        0
## 951   13_88    t10   120         0          0      0      0        0
## 1013  14_20    t10   111         0          0      0      0        0
## 696   11_53    t10   107         0          1      0      0        0
## 714   11_71    t10   106         1          1      0      0        0
## 969  13_106    t10   105         0          1      0      0        0
## 1082  14_89    t10   104         1          1      1      0        0
## 2042  20_83    t10   103         1          0      0      0        0
## 1293  16_13    t10   103         0          0      0      0        0
## 1010  14_17    t10    98         0          0      0      0        0
## 1220  15_88    t10    96         1          0      0      0        0
## 1758 18_160    t10    95         1          0      0      0        0
## 1038  14_45    t10    92         0          0      0      0        0
## 1031  14_38    t10    92         0          1      1      0        0
## 1561 17_134    t10    88         1          1      0      0        0
## 1390 16_110    t10    87         0          1      1      0        0
## 1343  16_63    t10    84         0          0      0      0        0
## 1238 15_106    t10    84         0          0      1      0        0
## 811   12_52    t10    84         0          0      1      0        0
## 747  11_104    t10    84         0          1      0      0        0
## 1662  18_64    t10    83         0          1      0      0        0
## 1469  17_42    t10    82         0          0      0      0        0
## 1222  15_90    t10    82         1          1      0      0        0
## 1133   15_1    t10    81         1          1      0      1        0
## 777   12_18    t10    81         1          1      0      1        0
## 650    11_7    t10    81         0          0      0      0        0
## 1287   16_7    t10    80         0          1      0      0        0
## 1186  15_54    t10    79         0          0      0      0        0
## 716   11_73    t10    77         0          0      0      0        0
## 1714 18_116    t10    76         1          1      0      1        0
## 1126 14_133    t10    76         0          1      1      1        0
## 724   11_81    t10    76         0          0      0      0        0
## 662   11_19    t10    76         1          0      1      0        0
## 1306  16_26    t10    75         0          1      0      1        0
## 1179  15_47    t10    75         0          1      0      0        0
## 706   11_63    t10    74         0          1      0      0        0
## 653   11_10    t10    74         0          0      0      0        0
## 1122 14_129    t10    73         0          1      1      0        0
## 989  13_126    t10    73         1          1      0      1        0
## 1072  14_79    t10    70         1          0      0      1        0
## 1880 19_115    t10    69         0          1      0      0        0
## 1076  14_83    t10    67         1          0      0      0        0
## 1221  15_89    t10    66         0          1      0      0        0
## 858   12_99    t10    66         0          0      0      0        0
## 1428   17_1    t10    65         0          1      0      1        0
## 1055  14_62    t10    65         0          0      0      0        0
## 911   13_48    t10    65         0          0      0      0        0
## 766    12_7    t10    65         0          0      0      1        0
## 677   11_34    t10    63         1          1      1      0        0
## 1630  18_32    t10    62         0          0      0      0        0
## 1751 18_153    t10    61         0          0      0      1        0
## 1182  15_50    t10    61         0          1      0      0        0
## 1005  14_12    t10    61         1          0      1      1        0
## 781   12_22    t10    61         0          0      0      0        0
## 665   11_22    t10    61         1          1      0      1        0
## 1750 18_152    t10    60         0          0      0      0        0
## 759  11_116    t10    60         1          1      1      1        1
## 1731 18_133    t10    59         0          0      0      0        0
## 1509  17_82    t10    59         1          1      1      0        0
## 1083  14_90    t10    59         0          0      0      1        0
## 1023  14_30    t10    59         0          1      1      0        0
## 762    12_3    t10    59         0          1      0      1        0
## 1146  15_14    t10    58         0          1      0      0        0
## 1199  15_67    t10    57         1          0      0      1        0
## 745  11_102    t10    57         0          1      0      0        0
## 701   11_58    t10    57         0          0      1      0        0
## 1670  18_72    t10    55         1          0      0      0        0
## 1247 15_115    t10    55         1          0      0      0        0
## 1103 14_110    t10    55         1          0      0      0        0
## 1490  17_63    t10    54         0          1      0      0        0
## 1422 16_142    t10    54         0          0      0      0        0
## 1211  15_79    t10    53         0          1      1      1        0
## 1098 14_105    t10    53         0          1      0      0        0
## 1025  14_32    t10    53         0          1      1      0        0
## 717   11_74    t10    53         0          1      0      0        0
## 981  13_118    t10    52         0          0      0      1        1
## 963  13_100    t10    52         0          0      0      1        1
## 893   13_30    t10    52         1          0      0      0        0
## 667   11_24    t10    52         0          1      1      0        0
## 1475  17_48    t10    51         0          1      1      0        0
## 1152  15_20    t10    51         0          1      0      0        0
## 1030  14_37    t10    51         0          0      0      0        0
## 826   12_67    t10    50         0          0      0      0        0
## 695   11_52    t10    50         0          1      0      0        0
## 659   11_16    t10    50         0          1      0      0        0
## 1744 18_146    t10    49         1          1      1      1        0
## 1596 17_169    t10    49         1          0      0      1        0
## 1535 17_108    t10    49         0          1      1      1        0
## 1488  17_61    t10    49         0          0      0      0        1
## 875   13_12    t10    49         1          1      0      0        0
## 693   11_50    t10    49         0          0      0      0        0
## 1167  15_35    t10    48         0          1      1      0        0
## 1142  15_10    t10    48         0          0      0      1        0
## 1128 14_135    t10    48         1          0      0      1        0
## 1063  14_70    t10    48         0          1      0      1        0
## 1039  14_46    t10    48         0          0      0      0        0
## 892   13_29    t10    48         0          0      0      0        0
## 854   12_95    t10    48         0          0      0      0        0
## 829   12_70    t10    48         0          1      1      0        0
## 1286   16_6    t10    47         0          1      0      1        0
## 983  13_120    t10    47         0          0      0      1        0
## 775   12_16    t10    47         0          1      1      0        0
## 1162  15_30    t10    46         1          1      0      0        0
## 1232 15_100    t10    45         0          1      1      0        0
## 1193  15_61    t10    45         0          0      0      0        0
## 792   12_33    t10    45         0          1      0      1        1
## 1265 15_133    t10    44         0          0      0      0        0
## 1240 15_108    t10    44         1          0      0      0        0
## 1028  14_35    t10    44         0          0      0      0        1
## 902   13_39    t10    44         0          1      0      0        0
## 844   12_85    t10    44         0          1      0      1        0
## 1761 18_163    t10    43         0          1      0      0        0
## 1663  18_65    t10    43         0          0      0      1        1
## 1290  16_10    t10    43         0          1      0      0        0
## 1274 15_142    t10    43         0          1      0      1        0
## 1120 14_127    t10    43         1          1      0      0        0
## 1041  14_48    t10    43         0          0      0      1        0
## 1024  14_31    t10    43         0          1      0      1        1
## 916   13_53    t10    43         1          0      0      0        0
## 903   13_40    t10    43         0          1      0      0        0
## 883   13_20    t10    43         0          1      1      0        0
## 1828  19_63    t10    42         0          0      1      0        0
## 1770   19_5    t10    42         1          1      0      0        0
## 1734 18_136    t10    42         0          1      0      1        0
## 1278 15_146    t10    42         0          1      0      0        0
## 1243 15_111    t10    42         0          0      0      0        0
## 836   12_77    t10    42         0          1      0      0        0
## 2483 22_112    t20    52         1          0      0      1        0
## 2307 21_145    t20    43         0          1      0      1        0
## 2222  21_60    t20    41         0          0      0      0        0
## 2176  21_14    t20    28         0          1      0      1        0
## 2358 21_196    t20    27         0          1      0      1        0
## 2198  21_36    t20    20         0          1      0      0        0
## 2334 21_172    t20    19         0          1      0      0        0
## 2219  21_57    t20    18         0          1      1      1        0
## 2312 21_150    t20    17         1          1      0      0        0
## 2206  21_44    t20    17         0          0      0      1        0
## 2461  22_90    t20    16         0          0      0      0        1
## 2339 21_177    t20    16         0          0      0      0        0
## 2171   21_9    t20    16         0          0      0      1        0
## 2365 21_203    t20    15         0          0      0      0        0
## 2359 21_197    t20    15         0          0      0      0        0
## 2306 21_144    t20    15         0          1      1      0        0
## 2218  21_56    t20    15         0          0      0      0        1
## 2211  21_49    t20    15         0          0      0      0        0
## 2180  21_18    t20    14         1          1      0      1        0
## 2287 21_125    t20    13         0          0      0      1        1
## 2191  21_29    t20    13         0          1      0      1        0
## 2421  22_50    t20    12         0          1      0      0        0
## 2271 21_109    t20    12         0          0      0      1        0
## 2169   21_7    t20    12         0          1      0      1        0
## 2166   21_4    t20    12         0          1      0      0        1
## 2445  22_74    t20    11         0          1      0      0        0
## 2324 21_162    t20    11         1          1      0      0        0
## 2289 21_127    t20    11         1          1      0      1        0
## 2362 21_200    t20    10         0          1      0      1        0
## 2341 21_179    t20    10         1          0      0      0        0
## 2263 21_101    t20    10         0          1      0      1        0
## 2244  21_82    t20    10         0          0      1      1        0
## 2231  21_69    t20    10         0          1      0      0        0
## 2215  21_53    t20    10         1          1      1      0        0
## 2189  21_27    t20    10         1          1      0      0        0
## 2348 21_186    t20     9         0          0      0      0        0
## 2295 21_133    t20     9         1          1      1      0        0
## 2230  21_68    t20     9         1          1      0      0        0
## 2531 22_160    t20     8         1          1      0      0        0
## 2492 22_121    t20     8         0          1      0      0        0
## 2382  22_11    t20     8         0          1      0      0        0
## 2227  21_65    t20     8         0          0      0      1        1
## 2223  21_61    t20     8         0          0      0      0        1
## 2168   21_6    t20     8         0          0      0      0        0
## 2527 22_156    t20     7         0          0      0      0        0
# Top 10% most cited by themes
top10pct_cite_theme_df <- aggregate(top_10pct_cite_df[, names(top_10pct_cite_df) %in% varnames], by = list(decade = top_10pct_cite_df$decade), FUN = sum)
# Calculate the share
top10pct_cite_share_theme <- data.frame(
  cbind(decade = top10pct_cite_theme_df$decade, top10pct_cite_theme_df[, names(top10pct_cite_theme_df) %in% varnames] / top_10pct_cutoff)
  )
top10pct_cite_share_theme
##   decade  education employment    income    policy   politics
## 1    t80 0.00000000  0.0000000 0.0000000 0.0000000 0.00000000
## 2    t90 0.07518797  0.8270677 0.2255639 0.2255639 0.15037594
## 3    t00 0.18218623  0.6275304 0.2024291 0.2834008 0.04048583
## 4    t10 0.25674786  0.5398288 0.2369980 0.2830810 0.06583278
## 5    t20 0.24175824  0.5714286 0.1098901 0.3736264 0.13186813
tot_hi_cite_decade <- top_10pct_cite_df %>%
  group_by(decade) %>%
  summarise(top10pct = sum(cited))
# share of total citations of the top 10% most cited publications
df_top10pct_cite_share <- merge(tot_cite_decade,tot_hi_cite_decade, by = 'decade')
df_top10pct_cite_share$decade <- factor(df_top10pct_cite_share$decade, ordered = T, levels = c("t80", "t90", "t00", "t10", "t20"))
df_top10pct_cite_share <- df_top10pct_cite_share[order(df_top10pct_cite_share$decade),]
df_top10pct_cite_share$top10_share <- df_top10pct_cite_share$top10pct / df_top10pct_cite_share$total_cite
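The resulting shares can be plotted directly. A minimal ggplot2 sketch using the columns created above:

ggplot(df_top10pct_cite_share, aes(x = decade, y = top10_share)) +
  geom_col() +
  labs(x = "Decade", y = "Share of total citations",
       title = "Citation share of the top 10% most-cited publications")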

7.3 Conceptual Framework

For each theme, get at least the two most representative (most highly cited) publications. Keep the article ids for further examination.

Get the character-vector version of the patterns, `patterns_vector`, to make pattern-based extraction easier.

#dat for author & publish year
#hi_cite for citation ranking
varnames # these are the themes as variable names
## [1] "education"  "employment" "income"     "policy"     "politics"
hi_cite_sort <- hi_cite[rev(order(hi_cite$cited)),] # sort citations in descending order
k <- 5 # top k publications per theme
# the following works to index individual themes
#(hi_cite_sort$id[hi_cite_sort$education==1])[1:k]
#(hi_cite_sort$id[hi_cite_sort[,varnames[1]]==1])[1:k]

focus_review_id_list <- list() # no need to pre-define the length; otherwise, filling list items by name creates unnecessary NULL items
# loop over an empty list to fill with article ids for literature map
for (theme in varnames) {
  #print((hi_cite_sort$id[hi_cite_sort[,theme]==1])[1:k])
  focus_review_id_list[[theme]] <- (hi_cite_sort$id[hi_cite_sort[,theme]==1])[1:k]
}
# print out hand-made execution log
length(focus_review_id_list)
## [1] 5
paste("The top", k, "publication ids have been extracted for the", length(focus_review_id_list), "themes.")
## [1] "The top 5 publication ids have been extracted for the 5 themes."
## HOWEVER, what if I want to be more specific? For each pattern within a theme, get the ids
#strsplit(pat_education, split = "|", fixed = T) # fixed = T is required; otherwise "|" is treated as regex alternation and the split returns single letters
#strsplit(pat_education, split = "|", fixed = T)[[1]] # double-bracket [[ ]] is needed to index into the list
length(strsplit(pat_education, split = "|", fixed = T)[[1]]) # length = the number of patterns
## [1] 4
patterns_vector <- list()
for (patterns in names(patterns_l) ) {
  print(paste(patterns, "has the following patterns:"))
  print(length(strsplit(patterns_l[[patterns]], split = "|", fixed = T)[[1]]))
  print(strsplit(patterns_l[[patterns]], split = "|", fixed = T)[[1]])
  print(paste(strsplit(patterns_l[[patterns]], split = "|", fixed = T)[[1]], collapse = ","))
  patterns_vector[[patterns]] <- strsplit(patterns_l[[patterns]], split = "|", fixed = T)[[1]] # keep as a character vector; paste(x, sep = ",") on a single vector would not join anything
}
## [1] "pat_education has the following patterns:"
## [1] 4
## [1] "education"  "learn"      "student"    "university"
## [1] "education,learn,student,university"
## [1] "pat_employ has the following patterns:"
## [1] 4
## [1] "employ" "labor"  "labour" "job"   
## [1] "employ,labor,labour,job"
## [1] "pat_income has the following patterns:"
## [1] 4
## [1] "wage"    "earning" "income"  "salary" 
## [1] "wage,earning,income,salary"
## [1] "pat_politics has the following patterns:"
## [1] 2
## [1] "politics"  "political"
## [1] "politics,political"
## [1] "pat_policy has the following patterns:"
## [1] 5
## [1] "policy"     "policies"   "regulation" "rule"       " law"      
## [1] "policy,policies,regulation,rule, law"
# We need a character vector of patterns for each theme,
# e.g. c("education", "learn", "student", "university")
names(patterns_vector)
## [1] "pat_education" "pat_employ"    "pat_income"    "pat_politics" 
## [5] "pat_policy"
names(patterns_vector) <- gsub("pat_","",names(patterns_vector)) # rename the list elements to be consistent with theme names. Note here the order of the varnames are different, so I cannot assign it to varnames directly
names(patterns_vector)[names(patterns_vector)=="employ"] <- "employment"
patterns_vector
## $education
## [1] "education"  "learn"      "student"    "university"
## 
## $employment
## [1] "employ" "labor"  "labour" "job"   
## 
## $income
## [1] "wage"    "earning" "income"  "salary" 
## 
## $politics
## [1] "politics"  "political"
## 
## $policy
## [1] "policy"     "policies"   "regulation" "rule"       " law"
dat_text <- dat[,c("id","Author Full Names","Publication Year", "text", "Times Cited, WoS Core")]
# easier way to group rename columns: rename("new" = "old", "new" = "old")
dat_text <- dat_text %>% rename("author" = "Author Full Names", "year" = "Publication Year", "cited" = "Times Cited, WoS Core")
dat_text <- dat_text[rev(order(dat_text$cited)),]
# test on individual items
(dat_text$id[grep(pattern = patterns_vector[['employment']][1], dat_text$text)])[1:k]
## [1] "01_9"  "07_11" "10_2"  "03_25" "01_20"
# create an empty list to contain the ids
lit_map_list <- list()
# outer
for (theme in names(patterns_vector)) {
  # inner
  lit_map_list[[theme]] <- list()
  for (i in seq_along(patterns_vector[[theme]])) {
    print(patterns_vector[[theme]][i])
    print((dat_text$id[grep(pattern = (patterns_vector[[theme]])[i], dat_text$text)])[1:k])
    print((dat_text$cited[grep(pattern = (patterns_vector[[theme]])[i], dat_text$text)])[1:k])
    print(paste((dat_text$author[grep(pattern = (patterns_vector[[theme]])[i], dat_text$text)])[1:k], (dat_text$year[grep(pattern = (patterns_vector[[theme]])[i], dat_text$text)])[1:k], sep = ", "))
    print("----------")
    # append ids to list/vector
    lit_map_list[[theme]][[i]] <- dat_text$id[grep(pattern = (patterns_vector[[theme]])[i], dat_text$text)][1:k]
  }
}
## [1] "education"
## [1] "07_11" "12_29" "08_34" "05_9"  "01_4" 
## [1] 524 400 376 360 293
## [1] "Hainmueller, Jens; Hiscox, Michael J., 2007"              
## [2] "Docquier, Frederic; Rapoport, Hillel, 2012"               
## [3] "Beine, Michel; Docquier, Frederic; Rapoport, Hillel, 2008"
## [4] "Card, D, 2005"                                            
## [5] "Iredale, R, 2001"                                         
## [1] "----------"
## [1] "learn"
## [1] "10_89"  "10_80"  "14_89"  "10_101" "07_18" 
## [1] 144 109 104  94  85
## [1] "Kennedy, G.; Judd, T.; Dalgarno, B.; Waycott, J., 2010"
## [2] "Amit, Karin, 2010"                                     
## [3] "Parutis, Violetta, 2014"                               
## [4] "Benzie, Helen Joy, 2010"                               
## [5] "Williams, Allan M., 2007"                              
## [1] "----------"
## [1] "student"
## [1] "10_89"  "18_40"  "08_36"  "05_19"  "18_160"
## [1] 144 122 120 120  95
## [1] "Kennedy, G.; Judd, T.; Dalgarno, B.; Waycott, J., 2010"      
## [2] "de Haas, Hein; Natter, Katharina; Vezzoli, Simona, 2018"     
## [3] "Chellaraj, Gnanaraj; Maskus, Keith E.; Mattoo, Aaditya, 2008"
## [4] "Hawthorne, L, 2005"                                          
## [5] "Lulle, Aija; Morosanu, Laura; King, Russell, 2018"           
## [1] "----------"
## [1] "university"
## [1] "10_89"  "08_36"  "20_83"  "12_18"  "10_106"
## [1] 144 120 103  81  75
## [1] "Kennedy, G.; Judd, T.; Dalgarno, B.; Waycott, J., 2010"        
## [2] "Chellaraj, Gnanaraj; Maskus, Keith E.; Mattoo, Aaditya, 2008"  
## [3] "Dong, Xiaofang; Zheng, Siqi; Kahn, Matthew E., 2020"           
## [4] "Stuen, Eric T.; Mobarak, Ahmed Mushfiq; Maskus, Keith E., 2012"
## [5] "Venhorst, Viktor; van Dijk, Jouke; van Wissen, Leo, 2010"      
## [1] "----------"
## [1] "employ"
## [1] "01_9"  "07_11" "10_2"  "03_25" "01_20"
## [1] 831 524 265 256 221
## [1] "Card, D, 2001"                              
## [2] "Hainmueller, Jens; Hiscox, Michael J., 2007"
## [3] "Kerr, William R.; Lincoln, William F., 2010"
## [4] "Bauder, H, 2003"                            
## [5] "Nee, V; Sanders, J, 2001"                   
## [1] "----------"
## [1] "labor"
## [1] "01_9"  "06_18" "01_17" "09_68" "10_29"
## [1] 831 620 589 537 525
## [1] "Card, D, 2001"                              
## [2] "Mayda, Anna Maria, 2006"                    
## [3] "Scheve, KF; Slaughter, MJ, 2001"            
## [4] "Storper, Michael; Scott, Allen J., 2009"    
## [5] "Hainmueller, Jens; Hiscox, Michael J., 2010"
## [1] "----------"
## [1] "labour"
## [1] "06_9"  "05_9"  "05_21" "01_4"  "03_25"
## [1] 394 360 313 293 256
## [1] "Bettio, Francesca; Simonazzi, Annamaria; Villa, Paola, 2006"
## [2] "Card, D, 2005"                                              
## [3] "Beaverstock, JV, 2005"                                      
## [4] "Iredale, R, 2001"                                           
## [5] "Bauder, H, 2003"                                            
## [1] "----------"
## [1] "job"
## [1] "09_68" "07_11" "94_8"  "11_84" "01_20"
## [1] 537 524 263 259 221
## [1] "Storper, Michael; Scott, Allen J., 2009"    
## [2] "Hainmueller, Jens; Hiscox, Michael J., 2007"
## [3] "HAMNETT, C, 1994"                           
## [4] "Oreopoulos, Philip, 2011"                   
## [5] "Nee, V; Sanders, J, 2001"                   
## [1] "----------"
## [1] "wage"
## [1] "01_9"  "01_17" "07_11" "11_41" "05_10"
## [1] 831 589 524 377 351
## [1] "Card, D, 2001"                              
## [2] "Scheve, KF; Slaughter, MJ, 2001"            
## [3] "Hainmueller, Jens; Hiscox, Michael J., 2007"
## [4] "Grogger, Jeffrey; Hanson, Gordon H., 2011"  
## [5] "Chiquiar, D; Hanson, GH, 2005"              
## [1] "----------"
## [1] "earning"
## [1] "03_23" "11_41" "05_10" "91_2"  "08_46"
## [1] 423 377 351 222 109
## [1] "Edin, PA; Fredriksson, P; Aslund, O, 2003"
## [2] "Grogger, Jeffrey; Hanson, Gordon H., 2011"
## [3] "Chiquiar, D; Hanson, GH, 2005"            
## [4] "CHISWICK, BR, 1991"                       
## [5] "Chiswick, Barry R.; Miller, Paul W., 2008"
## [1] "----------"
## [1] "income"
## [1] "07_11" "03_23" "11_41" "94_8"  "09_19"
## [1] 524 423 377 263 212
## [1] "Hainmueller, Jens; Hiscox, Michael J., 2007"
## [2] "Edin, PA; Fredriksson, P; Aslund, O, 2003"  
## [3] "Grogger, Jeffrey; Hanson, Gordon H., 2011"  
## [4] "HAMNETT, C, 1994"                           
## [5] "Facchini, Giovanni; Mayda, Anna Maria, 2009"
## [1] "----------"
## [1] "salary"
## [1] "10_80" "04_29" "07_40" "14_12" "10_22"
## [1] 109 100  95  61  58
## [1] "Amit, Karin, 2010"                        
## [2] "Ackers, L, 2004"                          
## [3] "Kahn, Lawrence M., 2007"                  
## [4] "Kofman, Eleonore, 2014"                   
## [5] "Mithas, Sunil; Lucas, Henry C., Jr., 2010"
## [1] "----------"
## [1] "politics"
## [1] "13_24" "17_86" "22_55" NA      NA     
## [1] 29  2  1 NA NA
## [1] "van Riemsdijk, Micheline, 2013"       
## [2] "Tomei, Gabriele, 2017"                
## [3] "Martinez, Cesar Augusto Ferrari, 2022"
## [4] "NA, NA"                               
## [5] "NA, NA"                               
## [1] "----------"
## [1] "political"
## [1] "01_4"  "00_1"  "12_43" "96_13" "07_16"
## [1] 293 185 153 152 138
## [1] "Iredale, R, 2001"                                            
## [2] "Mahroum, S, 2000"                                            
## [3] "de Haas, Hein, 2012"                                         
## [4] "Chan, KW, 1996"                                              
## [5] "Docquier, Frederic; Lohest, Olivier; Marfouk, Abdeslam, 2007"
## [1] "----------"
## [1] "policy"
## [1] "01_17"  "03_23"  "12_29"  "14_124" "02_8"  
## [1] 589 423 400 279 218
## [1] "Scheve, KF; Slaughter, MJ, 2001"           
## [2] "Edin, PA; Fredriksson, P; Aslund, O, 2003" 
## [3] "Docquier, Frederic; Rapoport, Hillel, 2012"
## [4] "Xiang, Biao; Lindquist, Johan, 2014"       
## [5] "Saxenian, A, 2002"                         
## [1] "----------"
## [1] "policies"
## [1] NA NA NA NA NA
## [1] NA NA NA NA NA
## [1] "NA, NA" "NA, NA" "NA, NA" "NA, NA" "NA, NA"
## [1] "----------"
## [1] "regulation"
## [1] "17_72" "04_3"  "96_3"  "16_26" "10_43"
## [1] 129 127  83  75  54
## [1] "Salmela, Mikko; von Scheve, Christian, 2017"                                                                                                                         
## [2] "Raghuram, P, 2004"                                                                                                                                                   
## [3] "Beaverstock, JV; Smith, J, 1996"                                                                                                                                     
## [4] "Beine, Michel; Boucher, Anna; Burgoon, Brian; Crock, Mary; Gest, Justin; Hiscox, Michael; McGovern, Patrick; Rapoport, Hillel; Schaper, Joep; Thielemann, Eiko, 2016"
## [5] "Kaur, Amarjit, 2010"                                                                                                                                                 
## [1] "----------"
## [1] "rule"
## [1] "10_2"   "15_79"  "15_142" "14_67"  "01_12" 
## [1] 265  53  43  24  24
## [1] "Kerr, William R.; Lincoln, William F., 2010"                                                                                                                                                                  
## [2] "Labonte, Ronald; Sanders, David; Mathole, Thubelihle; Crush, Jonathan; Chikanda, Abel; Dambisya, Yoswa; Runnels, Vivien; Packer, Corinne; MacKenzie, Adrian; Murphy, Gail Tomblin; Bourgeault, Ivy Lynn, 2015"
## [3] "Manzanilla, Linda R., 2015"                                                                                                                                                                                   
## [4] "Cerna, Lucie, 2014"                                                                                                                                                                                           
## [5] "Peixoto, J, 2001"                                                                                                                                                                                             
## [1] "----------"
## [1] " law"
## [1] "05_2"   "07_40"  "00_7"   "99_1"   "17_169"
## [1] 111  95  65  61  49
## [1] "Purkayastha, B, 2005"           "Kahn, Lawrence M., 2007"       
## [3] "Adepoju, A, 2000"               "Iredale, R, 1999"              
## [5] "Zhou, Min; Lee, Jennifer, 2017"
## [1] "----------"
lit_map_list
## $education
## $education[[1]]
## [1] "07_11" "12_29" "08_34" "05_9"  "01_4" 
## 
## $education[[2]]
## [1] "10_89"  "10_80"  "14_89"  "10_101" "07_18" 
## 
## $education[[3]]
## [1] "10_89"  "18_40"  "08_36"  "05_19"  "18_160"
## 
## $education[[4]]
## [1] "10_89"  "08_36"  "20_83"  "12_18"  "10_106"
## 
## 
## $employment
## $employment[[1]]
## [1] "01_9"  "07_11" "10_2"  "03_25" "01_20"
## 
## $employment[[2]]
## [1] "01_9"  "06_18" "01_17" "09_68" "10_29"
## 
## $employment[[3]]
## [1] "06_9"  "05_9"  "05_21" "01_4"  "03_25"
## 
## $employment[[4]]
## [1] "09_68" "07_11" "94_8"  "11_84" "01_20"
## 
## 
## $income
## $income[[1]]
## [1] "01_9"  "01_17" "07_11" "11_41" "05_10"
## 
## $income[[2]]
## [1] "03_23" "11_41" "05_10" "91_2"  "08_46"
## 
## $income[[3]]
## [1] "07_11" "03_23" "11_41" "94_8"  "09_19"
## 
## $income[[4]]
## [1] "10_80" "04_29" "07_40" "14_12" "10_22"
## 
## 
## $politics
## $politics[[1]]
## [1] "13_24" "17_86" "22_55" NA      NA     
## 
## $politics[[2]]
## [1] "01_4"  "00_1"  "12_43" "96_13" "07_16"
## 
## 
## $policy
## $policy[[1]]
## [1] "01_17"  "03_23"  "12_29"  "14_124" "02_8"  
## 
## $policy[[2]]
## [1] NA NA NA NA NA
## 
## $policy[[3]]
## [1] "17_72" "04_3"  "96_3"  "16_26" "10_43"
## 
## $policy[[4]]
## [1] "10_2"   "15_79"  "15_142" "14_67"  "01_12" 
## 
## $policy[[5]]
## [1] "05_2"   "07_40"  "00_7"   "99_1"   "17_169"
dat$Abstract[dat$id == "01_9"]
## [1] "This article uses 1990 census data to study the effects of immigrant inflows on occupation-specific labor market outcomes. I find that intercity mobility rates of natives and earlier immigrants are insensitive to immigrant inflows. However, occupation-specific wages and employment rates are systematically lower in cities with higher relative supplies of workers in a given occupation. The results imply that immigrant inflows over the 1980s reduced wages and employment rates of low-skilled natives in traditional gateway cities like Miami and Los Angeles by 1-3 percentage points."
dat$Abstract[dat$id == "06_18"]
## [1] "This paper empirically analyzes economic and noneconomic determinants of individual attitudes toward immigrants, within and across countries. The two survey data sets used, covering a wide range developed and developing countries, make it possible to test for interactive effects between individual characteristics and country-level attributes. In particular, theory predicts that the correlation between pro-immigration attitudes and individual skill should be related to the skill composition natives relative to immigrants in the destination country. Skilled individuals should favor immigration in countries where natives are more skilled than immigrants and oppose it otherwise. Results based on direct and indirect measures of the relative skill composition are consistent with these predictions. Noneconomic variables also are correlated with immigration attitudes, but they don't alter significantly the labor-market results."
dat$Abstract[dat$id == "01_17"]
## [1] "This paper uses three years of individual-level data to analyze the determinants of individual preferences over immigration policy in the United States. We have two main empirical results. First, less-skilled workers are significantly more likely to prefer limiting immigrant inflows into the United States. Our finding suggests that, over the time horizons that are relevant to individuals when evaluating immigration policy, individuals think that the U.S, economy absorbs immigrant inflows at least partly by changing wages. Second, we find no evidence that the relationship between skills and immigration opinions is stronger in high-immigration communities."
dat$Abstract[dat$id == "09_68"]
## [1] "Do jobs follow people or do people follow jobs A number of currently prominent approaches to urbanization respond to this question by privileging the role of individual locational choice in response to amenity values as the motor of contemporary urban growth. Amenities, it is often said, have an especially potent effect on the migration patterns of individuals endowed with high levels of human capital. However, these approaches raise many unanswered questions. Theories that describe urban growth as a response to movements of people in search of consumer or lifestyle preferences can be questioned on the grounds of their assumptions about human behavior, as well as their silence in regard to the geographical dynamics of production and work. We argue that a more effective line of explanation must relate urban growth directly to the economic geography of production and must explicitly deal with the complex recursive interactions between the location of firms and the movements of labor. In this context, we also offer a reinterpretation of the currently fashionable notions of creativity and the role of skilled labor in cities."

8 Save Data

# last updated on 08/27/2023 by removing pol = policy + politics
write_xlsx(list("data" = dat,"theme_freq" = text_colsum, "cat_freq" = cat_freq, "cat_id" = cat_id, "dcd_cat"= dec_cat, "key_cat" = key_cat, "theme_dec" = theme_dec,"theme_pt_dec" = theme_dec_t_pct, "theme_yr" = theme_yr, "keyword" = keyword, "key_freq" = keyword_freq, "kw_binary" = keyword_bi, "txt_binary" = text_bi, "kw_colsum" = keyword_colsum, "year_cat" = year_cat, "year_ct" = year_ct, "theme_cat_dec" = theme_dec_cat_df, 'top10pct_cite_ct' = top10pct_cite_theme_df, 'top10pct_cite_pct' = top10pct_cite_share_theme), "mig_analysis.xlsx")
Sys.time()
## [1] "2023-08-30 23:01:31 EDT"