knitr::opts_chunk$set(echo = TRUE)
options(tibble.width = Inf)

Library

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(tidyr)
library(ggplot2)
library(tidygeocoder)

## Warning: package 'tidygeocoder' was built under R version 4.4.1

library(purrr)

## Warning: package 'purrr' was built under R version 4.4.1

library(leaflet)
library(leaflet.extras)

## Warning: package 'leaflet.extras' was built under R version 4.4.1

library(httr) 
library(jsonlite)

## 
## Attaching package: 'jsonlite'

## The following object is masked from 'package:purrr':
## 
##     flatten

library(sf)

## Warning: package 'sf' was built under R version 4.4.1

## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE

library(rnaturalearth)

## Warning: package 'rnaturalearth' was built under R version 4.4.1

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Sys.setenv(GEMINI_API_KEY = "AIzaSyCxbDGILqfXqMyjQ6QbGZHdYUJC_NdNOFg")
gemini_endpoint <- "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"

Descriptive Function

quick_numeric_stats <- function(data, column) {
  vec <- data %>% pull({{ column }})
  if (!is.numeric(vec)) stop("Column must be numeric")

  vec_non_na <- vec[!is.na(vec)]

  tibble(
    variable = as_label(enquo(column)),
    total    = length(vec),
    missing  = sum(is.na(vec)),
    min      = min(vec_non_na),
    q1       = quantile(vec_non_na, 0.25),
    median   = median(vec_non_na),
    mean     = mean(vec_non_na),
    q3       = quantile(vec_non_na, 0.75),
    max      = max(vec_non_na),
    sd       = sd(vec_non_na)
  )
}

Import Raw Dataset

dissertations <- read_csv("dissertations-clean.csv", show_col_types = FALSE)
# print total rows of raw data
total_rows <- nrow(dissertations)
cat("Total rows: ", total_rows, "\n")

## Total rows:  5471940

# count all missing value for degree/degreelevel/for both
dissertations %>%  
  summarise(
    total = n(),
    
    missing_degree = sum(
      is.na(degree) |
      str_trim(degree) == "" |
      degree == "[None]"
    ),
    pct_missing_degree = missing_degree / total,
    
    missing_degreelevel = sum(
      is.na(degreelevel) |
      str_trim(degreelevel) == "" |
      degreelevel == "[None]"
    ),
    pct_missing_degreelevel = missing_degreelevel / total,
    
    missing_either = sum(
      (is.na(degree)      | str_trim(degree) == ""      | degree == "[None]") |
      (is.na(degreelevel) | str_trim(degreelevel) == "" | degreelevel == "[None]")
    ),
    pct_missing_either = missing_either / total
  ) %>%
  print()

## # A tibble: 1 × 7
##     total missing_degree pct_missing_degree missing_degreelevel
##     <int>          <int>              <dbl>               <int>
## 1 5471940          94968             0.0174                   0
##   pct_missing_degreelevel missing_either pct_missing_either
##                     <dbl>          <int>              <dbl>
## 1                       0          94968             0.0174

Filling the [None] degree using degree level

dissertations %>%
  filter(degree == "[None]") %>%
  distinct(degreelevel) %>%
  arrange(degreelevel) %>%
  write_csv("degreelevel_to_degree_mapping.csv")

mapping <- read_csv("degreelevel_to_degree_mapping_marked.csv")

## Rows: 117 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): degreelevel, mapped_degree
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dissertations_cleaned <- dissertations %>%
  left_join(mapping, by = "degreelevel") %>%
  mutate(
    degree = if_else(
      degree %in% c("[None]") & !is.na(mapped_degree),
      mapped_degree,
      degree
    )
  )

# Check how many still [None]
dissertations_cleaned %>%
  summarise(
    total = n(),
    still_none_degree = sum(degree == "[None]" | is.na(degree)),
    still_none_degreelevel = sum(degreelevel == "[None]" | is.na(degreelevel))
  )

## # A tibble: 1 × 3
##     total still_none_degree still_none_degreelevel
##     <int>             <int>                  <int>
## 1 5471940                 0                      0

Identify the Doctorate

# print all degree and degreelevel name to see possible keywords
dissertations_cleaned %>% 
  distinct(degree) %>% 
  arrange(degree) %>% 
  write_csv("all_degree_values_raw.csv")

dissertations_cleaned %>% 
  distinct(degreelevel) %>% 
  arrange(degreelevel) %>% 
  write_csv("all_degreelevel_values_raw.csv")

# import two hand-tagged keywords mapping files

deg_map <- read_csv(
  "all_degree_values_marked.csv",
  col_types = cols(
    degree      = col_character(),
    is_doc      = col_logical()
  )
) %>%
  rename(is_doctorate_deg = is_doc)

lvl_map <- read_csv(
  "all_degreelevel_values_marked.csv",
  col_types = cols(
    degreelevel      = col_character(),
    is_doc           = col_logical()
  )
) %>% 
  rename(is_doctorate_level = is_doc)
clean_text <- function(x) stringr::str_squish(stringr::str_replace_all(x, "\u00A0", " "))

deg_map_one <- deg_map %>%
  dplyr::mutate(degree = clean_text(degree)) %>%
  dplyr::group_by(degree) %>%
  dplyr::summarise(
    is_doctorate_deg = any(is_doctorate_deg, na.rm = TRUE),
    .groups = "drop"
  )

lvl_map_one <- lvl_map %>%
  dplyr::mutate(degreelevel = clean_text(degreelevel)) %>%
  dplyr::group_by(degreelevel) %>%
  dplyr::summarise(
    is_doctorate_level = any(is_doctorate_level, na.rm = TRUE),
    .groups = "drop"
  )

dissertations_flagged <- dissertations_cleaned %>%
  dplyr::mutate(
    degree      = clean_text(degree),
    degreelevel = clean_text(degreelevel)
  ) %>%
  dplyr::left_join(deg_map_one, by = "degree",      relationship = "many-to-one") %>%
  dplyr::left_join(lvl_map_one, by = "degreelevel", relationship = "many-to-one") %>%
  dplyr::mutate(
    is_doctorate_deg   = coalesce(is_doctorate_deg,   FALSE),
    is_doctorate_level = coalesce(is_doctorate_level, FALSE),
    by_degree_only     = is_doctorate_deg   & !is_doctorate_level,
    by_level_only      = !is_doctorate_deg &  is_doctorate_level,
    by_both            = is_doctorate_deg   &  is_doctorate_level,
    is_doctorate       = by_both
  )

doctorates <- dissertations_flagged %>%
  dplyr::filter(is_doctorate) %>%
  dplyr::distinct()
  
total_dissertations <- nrow(dissertations_cleaned) 
num_doctorates <- nrow(doctorates)
pct_doctorates <- num_doctorates / total_dissertations

coverage_stats <- dissertations_flagged %>%
  dplyr::filter(
    !is.na(degree),      stringr::str_trim(degree)      != "", degree      != "[None]",
    !is.na(degreelevel), stringr::str_trim(degreelevel) != "", degreelevel != "[None]"
  ) %>%
  dplyr::distinct() %>%
  dplyr::summarise(
    n_is_degree = sum(by_degree_only),
    n_is_level  = sum(by_level_only),
    n_both      = sum(by_both)
  )

cat("Original total rows:", total_dissertations, "\n")

## Original total rows: 5471940

cat("STRICT (both) doctorates:", num_doctorates, " / ", total_dissertations,
    " (", sprintf('%.2f%%', 100*pct_doctorates), ")\n\n", sep = "")

## STRICT (both) doctorates:3900167 / 5471940 (71.28%)

print(coverage_stats)

## # A tibble: 1 × 3
##   n_is_degree n_is_level  n_both
##         <int>      <int>   <int>
## 1           4         29 3900167

write_csv(doctorates, "dissertations-doctorates.csv")

# degree-only rows
degree_only_rows <- dissertations_flagged %>%
  filter(by_degree_only) %>%
  distinct()

write_csv(degree_only_rows, "doctorates-degree-only.csv")

# level-only rows
level_only_rows <- dissertations_flagged %>%
  filter(by_level_only) %>%
  distinct()

write_csv(level_only_rows, "doctorates-level-only.csv")
# after revision, all degree and degreelevel left are unmatched. ASK USE BOTH or OR!

Publish date analysis

# descriptive stats: No missing value
dc <- read_csv("dissertations-doctorates.csv", show_col_types = FALSE)

dc <- dc %>%
  mutate(
    publication_date = as.Date(publishedate),
    pub_year  = as.integer(format(publication_date, "%Y")),
    pub_month = as.integer(format(publication_date, "%m")),
    pub_day   = as.integer(format(publication_date, "%d")),
  )

pub_year_stats <- quick_numeric_stats(dc, pub_year) %>%
  mutate(missing_pct = round(100 * missing / total, 2))

print(pub_year_stats)

## # A tibble: 1 × 11
##   variable   total missing   min    q1 median  mean    q3   max    sd
##   <chr>      <int>   <int> <int> <dbl>  <int> <dbl> <dbl> <int> <dbl>
## 1 pub_year 3900167       0  1950  1983   2000 1996.  2012  2023  17.9
##   missing_pct
##         <dbl>
## 1           0

# data validity test: All data are good
dc <- dc %>%
  mutate(
    invalid_month = !is.na(pub_month) & !(pub_month %in% 1:12),
    invalid_day   = !is.na(pub_day)   & !(pub_day   %in% 1:31),
    parse_fail    = is.na(publication_date) 
  )

date_validity <- dc %>%
  summarise(
    n_total           = n(),
    parse_fail_n      = sum(parse_fail),
    invalid_month_n   = sum(invalid_month),
    invalid_day_n     = sum(invalid_day)
  )
print(date_validity)

## # A tibble: 1 × 4
##   n_total parse_fail_n invalid_month_n invalid_day_n
##     <int>        <int>           <int>         <int>
## 1 3900167            0               0             0

# year distribution
ggplot(dc %>% filter(!is.na(pub_year)), aes(x = pub_year)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Dissertations by Publication Year",
       x = "Publication Year", y = "Count")

Degree year analysis

# Doctorate degree years should be four-digit numeric values (YYYY).  
doctorates <- doctorates %>%
  mutate(degreeyear = suppressWarnings(as.integer(degreeyear)))

# total 30 rows that are not 4 digits
invalid_rows <- doctorates %>%
  filter(is.na(degreeyear) | !str_detect(as.character(degreeyear), "^\\d{4}$"))

write_csv(invalid_rows, "degreeyear_invalid_rows.csv")

# valid 4 digits
doctorates_valid4 <- doctorates %>%
  filter(!is.na(degreeyear) & str_detect(as.character(degreeyear), "^\\d{4}$"))

# 1600-2025 distribution
print( quick_numeric_stats(doctorates_valid4, degreeyear) )

## # A tibble: 1 × 10
##   variable     total missing   min    q1 median  mean    q3   max    sd
##   <chr>        <int>   <int> <int> <dbl>  <int> <dbl> <dbl> <int> <dbl>
## 1 degreeyear 3900137       0  1605  1983   2000 1996.  2012  2023  17.9

ggplot(doctorates_valid4, aes(degreeyear)) +
  geom_histogram(binwidth = 1, boundary = 0, closed = "left",
                 fill = "blue", colour = "grey") +
  scale_x_continuous(breaks = seq(1600, 2025, 25), limits = c(1600, 2025)) +
  labs(title = "Doctorate Degree Years (All 4-digit)", x = "Degree Year", y = "Count") +
  theme_minimal()

# <1950 distribution
doctorates_valid4 %>%
  filter(degreeyear < 1950) %>%
  summarise(n = n(), min_year = min(degreeyear), max_year = max(degreeyear)) %>%
  print()

## # A tibble: 1 × 3
##       n min_year max_year
##   <int>    <int>    <int>
## 1    52     1605     1949

# filter 1950–2023
doctorates_clean <- doctorates_valid4 %>%
  filter(degreeyear >= 1950, degreeyear <= 2023)

print( quick_numeric_stats(doctorates_clean, degreeyear) )

## # A tibble: 1 × 10
##   variable     total missing   min    q1 median  mean    q3   max    sd
##   <chr>        <int>   <int> <int> <dbl>  <int> <dbl> <dbl> <int> <dbl>
## 1 degreeyear 3900085       0  1950  1983   2000 1996.  2012  2023  17.9

ggplot(doctorates_clean, aes(degreeyear)) +
  geom_histogram(binwidth = 5, boundary = 0, closed = "left",
                 fill = "blue", colour = "grey") +
  scale_x_continuous(breaks = seq(1950, 2025, 5), limits = c(1950, 2025)) +
  labs(title = "Doctorate Degree Years (1950–2023, 5-year bins)",
       x = "Degree Year", y = "Count") +
  theme_minimal()

write_csv(doctorates_clean, "doctorates_1950_2023.csv")

Degree vs Publish Year Analysis

doctorates_clean <- readr::read_csv("dissertations-doctorates.csv", show_col_types = FALSE)

year_min <- 1950; year_max <- 2023

dc <- doctorates_clean %>%
  mutate(
    pub_year = year(publishedate),
    degreeyear_int = suppressWarnings(as.integer(degreeyear))
  )

# coverage table
coverage <- dc %>%
  dplyr::summarise(
    n_total     = dplyr::n(),
    n_deg_valid = sum(stringr::str_detect(as.character(degreeyear), "^\\d{4}$") &
                      as.integer(degreeyear) >= year_min & as.integer(degreeyear) <= year_max, na.rm = TRUE),
    n_pub_valid = sum(!is.na(pub_year), na.rm = TRUE),
    n_both      = sum(!is.na(pub_year) &
                      stringr::str_detect(as.character(degreeyear), "^\\d{4}$") &
                      as.integer(degreeyear) >= year_min & as.integer(degreeyear) <= year_max, na.rm = TRUE)
  ) %>%
  dplyr::mutate(
    pct_deg_valid = n_deg_valid / n_total,
    pct_pub_valid = n_pub_valid / n_total,
    pct_both      = n_both      / n_total
  )
print(coverage)

## # A tibble: 1 × 7
##   n_total n_deg_valid n_pub_valid  n_both pct_deg_valid pct_pub_valid pct_both
##     <int>       <int>       <int>   <int>         <dbl>         <dbl>    <dbl>
## 1 3900167     3900085     3900167 3900085          1.00             1     1.00

readr::write_csv(coverage, "coverage_degree_vs_publish.csv")

# compute lag = publish year − degree year, usually publish year >= degree year
lag_df <- dc %>%
  dplyr::mutate(degreeyear = suppressWarnings(as.integer(degreeyear))) %>%
  dplyr::filter(
    !is.na(pub_year),
    !is.na(degreeyear),
    dplyr::between(degreeyear, year_min, year_max)
  ) %>%
  dplyr::mutate(year_diff = pub_year - degreeyear)

cat("Lag sample size (both present, degree ", year_min, "–", year_max, "): ",
    nrow(lag_df), "\n", sep = "")

## Lag sample size (both present, degree 1950–2023): 3900085

# basic stats
lag_num <- quick_numeric_stats(lag_df, year_diff)

lag_counts <- lag_df %>%
  dplyr::summarise(
    n_valid     = dplyr::n(),
    n_same_year = sum(year_diff == 0),
    n_before    = sum(year_diff < 0),
    n_after     = sum(year_diff > 0)
  ) %>%
  dplyr::mutate(
    share_same  = n_same_year / n_valid,
    share_before= n_before    / n_valid,
    share_after = n_after     / n_valid
  )

lag_summary <- dplyr::bind_cols(lag_num, lag_counts)
print(lag_summary)

## # A tibble: 1 × 17
##   variable    total missing   min    q1 median   mean    q3   max    sd n_valid
##   <chr>       <int>   <int> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
## 1 year_diff 3900085       0   -51     0      0 0.0150     0    61 0.626 3900085
##   n_same_year n_before n_after share_same share_before share_after
##         <int>    <int>   <int>      <dbl>        <dbl>       <dbl>
## 1     3887833     1842   10410      0.997     0.000472     0.00267

readr::write_csv(lag_summary, "lag_summary_and_shares.csv")

# histogram of lag
zoom_xlim <- c(-3, 5)  
ggplot(lag_df, aes(x = year_diff)) +
  geom_histogram(binwidth = 1, boundary = 0, closed = "left",
                 aes(y = after_stat(..count.. / sum(..count..)))) +
  coord_cartesian(xlim = zoom_xlim) +
  labs(title = paste0("Lag = publish year − degree year (zoom ", zoom_xlim[1], "…", zoom_xlim[2], ")"),
       x = "Years", y = "Share of dissertations") +
  theme_minimal()

# outliers (publish >5y after OR >1y before degree)
neg_outlier <- -1; pos_outlier <- 5 
outliers <- lag_df %>%
  dplyr::filter(year_diff < neg_outlier | year_diff > pos_outlier) %>%
  dplyr::arrange(dplyr::desc(abs(year_diff)))
readr::write_csv(outliers, "outliers_lag.csv")
cat("Outliers flagged: ", nrow(outliers),
    " (criteria: <", neg_outlier, " or >", pos_outlier, ")\n", sep = "")

## Outliers flagged: 2488 (criteria: <-1 or >5)

Make crosswalk for degrees

Need adjust for new degree

deg_unique <- doctorates_clean %>%            
  distinct(degree) %>% 
  pull()
cat("Unique degree variants:", length(deg_unique), "\n")

## Unique degree variants: 389

#doctorates_clean <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)
# 
# gemini_map <- function(deg_vec) {
# 
# # prompt
#   prompt <- paste(
#     "You are a data-cleaning assistant.",
#     "OUTPUT must be ONLY a JSON array, no preface, no markdown fences.",
#     "Each object needs keys 'original' and 'standard'.",
#     "- 'standard' = full formal degree name (e.g. 'Doctor of Philosophy').",
#     "- Do NOT return abbreviations like PhD, EdD, DBA, etc.",
#     "- If the original is already formal, copy it to 'standard'.",
#     "- Keep the list order.",
#     "\n\n",
#     paste0(seq_along(deg_vec), ". ", deg_vec, collapse = "\n")
#   )
# 
#   body <- list(
#     contents = list(list(parts = list(list(text = prompt))))
#   )
# 
#   resp <- POST(
#     gemini_endpoint,
#     body   = toJSON(body, auto_unbox = TRUE),
#     encode = "json",
#     add_headers(
#       `Content-Type`   = "application/json",
#       `X-goog-api-key` = Sys.getenv("GEMINI_API_KEY")
#     )
#   )
#   stop_for_status(resp)
# 
#   txt <- fromJSON(content(resp, "text", encoding = "UTF-8"),
#                   simplifyVector = FALSE)$candidates[[1]]$content$parts[[1]]$text
# 
#   json_str <- txt |>
#     str_extract("(?s)\\[.*\\]") |>
#     str_trim()
# 
#   as_tibble(fromJSON(json_str))
# }

# chunks <- split(deg_unique, ceiling(seq_along(deg_unique) / 50))
# crosswalk <- map_dfr(chunks, \(v) { out <- gemini_map(v); Sys.sleep(1); out })

#write_csv(crosswalk, "degree_crosswalk_api_raw.csv")
#print(crosswalk, n = 20)

Top10 Degree

crosswalk <- readr::read_csv(
  "degree_crosswalk_api_marked.csv", 
  show_col_types = FALSE
)

doctorates_std <- doctorates_clean |>
  dplyr::left_join(crosswalk, by = c("degree" = "original")) |>
  dplyr::mutate(
    standard_degree = dplyr::coalesce(standard, degree)
  )

# top 10 degree
top10_degree <- doctorates_std |>
  dplyr::count(standard_degree, sort = TRUE, name = "n") |>
  dplyr::slice_head(n = 10)

total_degrees <- dplyr::n_distinct(doctorates_std$standard_degree)

cat("Total unique degrees (standardized):", total_degrees, "\n\n")

## Total unique degrees (standardized): 313

print(top10_degree)

## # A tibble: 10 × 2
##    standard_degree                         n
##    <chr>                               <int>
##  1 Doctor of Philosophy              3338159
##  2 Doctor of Education                222885
##  3 Doctorate/Docteur                  128825
##  4 Doctor of Psychology                30118
##  5 Doctor of Medicine/Medical Doctor   27044
##  6 Doctor of Musical Arts              16629
##  7 Doctor of Business Administration   15707
##  8 Doctor of Ministry                  15219
##  9 Doctor of Engineering               11751
## 10 Doctor of Clinical Psychology        8115

degree_counts <- doctorates_std %>% 
  dplyr::count(standard_degree, name = "n") %>% 
  dplyr::arrange(desc(n))

write_csv(degree_counts, "all_standard_degrees_with_counts.csv")

# top 10 universities (total count)
top10_by_total <- doctorates_clean %>%
count(university, name = "total_papers") %>%
slice_max(total_papers, n = 10)
print(top10_by_total)

## # A tibble: 10 × 2
##    university                                    total_papers
##    <chr>                                                <int>
##  1 University of California, Berkeley                   48513
##  2 The University of Wisconsin - Madison                48180
##  3 University of Oxford (United Kingdom)                44764
##  4 The University of Manchester (United Kingdom)        44388
##  5 University of Illinois at Urbana-Champaign           41632
##  6 University of Michigan                               40521
##  7 The Ohio State University                            39845
##  8 Columbia University                                  38748
##  9 University of Minnesota                              38575
## 10 Harvard University                                   38574

# top 10 universities (average paper per year)
univ_avg_per_year <- doctorates_clean %>%
  group_by(university) %>%
  summarise(
    total_papers    = n(),
    active_years    = n_distinct(degreeyear),
    papers_per_year = total_papers / active_years
  ) %>%
  ungroup() %>%
  arrange(desc(papers_per_year))

# total school count
total_schools <- n_distinct(doctorates_clean$university)

# top 10
top10_by_avg <- univ_avg_per_year %>%
  slice_max(papers_per_year, n = 10)
print(top10_by_avg)

## # A tibble: 10 × 4
##    university                                    total_papers active_years
##    <chr>                                                <int>        <int>
##  1 University of California, Berkeley                   48513           73
##  2 The University of Wisconsin - Madison                48180           74
##  3 University of Oxford (United Kingdom)                44764           73
##  4 The University of Manchester (United Kingdom)        44388           73
##  5 University of Illinois at Urbana-Champaign           41632           71
##  6 Capella University                                   14577           26
##  7 University of Michigan                               40521           73
##  8 The Ohio State University                            39845           73
##  9 Harvard University                                   38574           73
## 10 Columbia University                                  38748           74
##    papers_per_year
##              <dbl>
##  1            665.
##  2            651.
##  3            613.
##  4            608.
##  5            586.
##  6            561.
##  7            555.
##  8            546.
##  9            528.
## 10            524.

cat("Total universities:", total_schools, "\n\n")

## Total universities: 2985

Univeristy Location & World Graph

doctorates_clean %>%
  distinct(university) %>%
  arrange(university) %>%
  write_csv("unique_universities.csv")

# Google Geocoding API
# Sys.setenv(GOOGLEGEOCODE_API_KEY = "AIzaSyCiqn3cISXHi85YxG5ycdSMD6g9tctlpQE")
# uni_raw <- read_csv("unique_universities.csv", col_names = "university")
# 
# geo_out <- uni_raw %>%
#   geocode(
#     address      = university,
#     method       = "google",
#     lat          = latitude,
#     long         = longitude,
#     full_results = TRUE,
#     limit        = 1
#   )
# 
# uni_coords_only <- geo_out %>%
#   mutate(
#     lat = round(latitude,  6),
#     lon = round(longitude, 6)
#   ) %>%
#   transmute(
#     university        = university,
#     lon,
#     lat,
#     address           = formatted_address
#   )
# 
# print(head(uni_coords_only, 20))
# write_csv(uni_coords_only, "universities_with_coords_and_address.csv")

# missing
uni_coords <- read_csv("universities_with_coords_and_address.csv")

## Rows: 2985 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): university, address
## dbl (2): lon, lat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

missing_summary <- uni_coords %>%
  summarise(
    total = n(),
    missing_lon      = sum(is.na(lon)),
    missing_lat      = sum(is.na(lat)),
    missing_both     = sum(is.na(lon) & is.na(lat)),
    pct_missing_lon  = missing_lon  / total,
    pct_missing_lat  = missing_lat  / total,
    pct_missing_both = missing_both / total
  )

print(missing_summary)

## # A tibble: 1 × 7
##   total missing_lon missing_lat missing_both pct_missing_lon pct_missing_lat
##   <int>       <int>       <int>        <int>           <dbl>           <dbl>
## 1  2985          74          74           74          0.0248          0.0248
##   pct_missing_both
##              <dbl>
## 1           0.0248

# hand-tagged dataset, deleted "None"
unis <- read_csv("universities_with_coords_marked.csv")

## Rows: 2983 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): university, address
## dbl (2): lon, lat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# check missing values
missing_summary2 <- unis %>%
  summarise(
    total_rows      = n(),
    missing_lon     = sum(is.na(lon)),
    missing_lat     = sum(is.na(lat)),
    missing_address = sum(is.na(address))
  )

print(missing_summary2)

## # A tibble: 1 × 4
##   total_rows missing_lon missing_lat missing_address
##        <int>       <int>       <int>           <int>
## 1       2983           0           0               0

Graph: number of universities and number of thesis

# summarize the number of papers by university
thesis_by_uni <- doctorates_clean %>%
  count(university, name = "thesis_n")

unis_counts <- unis %>%
  left_join(thesis_by_uni, by = "university") %>%
  mutate(thesis_n = tidyr::replace_na(thesis_n, 0))

# find country using lon and lat
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf") %>%
  dplyr::select(country = admin)  

unis_sf <- st_as_sf(
  unis_counts,
  coords = c("lon", "lat"),
  crs    = 4326, remove = FALSE
)

unis_country <- st_join(unis_sf, world, join = st_within) %>%
  st_drop_geometry()

missing_init <- which(is.na(unis_country$country))
cat("Initial unmatched universities:", length(missing_init), "\n")

## Initial unmatched universities: 85

# manual check
if (length(missing_init) > 0) {
  write_csv(unis_country[missing_init, ], "unmatched_unis.csv")
}

manual_fix <- readr::read_csv("unmatched_unis_marked.csv", show_col_types = FALSE) %>% 
  dplyr::select(university, country)

unis_country <- unis_country %>% 
  dplyr::left_join(manual_fix, by = "university", suffix = c("", "_man")) %>% 
  dplyr::mutate(country = dplyr::coalesce(country, country_man)) %>% 
  dplyr::select(-country_man)

missing_left <- sum(is.na(unis_country$country))
cat("Still Missing：", missing_left, "\n")

## Still Missing： 0

country_sum <- unis_country %>%
  group_by(country) %>%
  summarise(
    lon          = mean(lon, na.rm = TRUE),
    lat          = mean(lat, na.rm = TRUE),
    univ_n       = n_distinct(university),
    thesis_total = sum(thesis_n),
    .groups      = "drop"
  ) %>%
  filter(!is.na(country))

bins_cty <- c(0, 500, 1000, 5000, 10000, 30000, Inf)
pal_cty  <- colorBin("Blues", domain = country_sum$thesis_total, bins = bins_cty)

leaflet(country_sum) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    ~lon, ~lat,
    radius      = ~pmax(3, sqrt(univ_n)),
    fillColor   = ~pal_cty(thesis_total),
    fillOpacity = 0.85, stroke = FALSE,
    popup = ~paste0(
      "<b>", country, "</b><br/>",
      "Universities: ", univ_n, "<br/>",
      "Doctoral theses: ", thesis_total
    )
  ) %>%
  addLegend(
    "bottomright", pal = pal_cty, values = ~thesis_total,
    title = "Doctoral theses (country)", opacity = 1
  )

Graph: number of thesis per universities

bins_uni <- c(0, 500, 1000, 5000, 10000, 30000, Inf)
pal_uni  <- colorBin("YlOrRd", domain = unis_counts$thesis_n, bins = bins_uni)

fixed_radius <- 6

leaflet(unis_counts) %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addCircleMarkers(
    lng         = ~lon,
    lat         = ~lat,
    radius      = fixed_radius,         
    fillColor   = ~pal_uni(thesis_n),
    fillOpacity = 0.85, stroke = FALSE,
    popup = ~paste0(
      "<b>", university, "</b><br/>",
      "Doctoral theses: ", thesis_n
    )
  ) %>% 
  addLegend(
    "bottomright",
    pal     = pal_uni,
    values  = ~thesis_n,
    title   = "Doctoral theses per university",
    opacity = 1
  )

clean fields

fields_raw <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)

clean_terms <- function(x) {
  x %>% 
    str_replace_all("\\[|\\]", "") %>% 
    str_squish() %>% 
    str_to_sentence(locale = "en")
}

fields <- fields_raw %>% 
  mutate(across(any_of(c("classterms", "subjecterms", "papercat")), clean_terms)) %>% 
  mutate(across(where(is.character), ~ na_if(., ""))) 

uniq_class <- fields %>%
  separate_rows(classterms, sep = ";") %>%
  mutate(term = str_trim(classterms)) %>%
  filter(!is.na(term) & term != "") %>%
  distinct(term)

uniq_subject <- fields %>%
  separate_rows(subjecterms, sep = ";") %>%
  mutate(term = str_trim(subjecterms)) %>%
  filter(!is.na(term) & term != "") %>%
  distinct(term)

uniq_papercat <- fields %>%
  separate_rows(papercat, sep = ";") %>%
  mutate(term = str_trim(papercat)) %>%
  filter(!is.na(term) & term != "") %>%
  distinct(term)

  
# export csv
write_csv(uniq_class,   "unique_classterms.csv")
write_csv(uniq_subject, "unique_subjecterms.csv")
write_csv(uniq_papercat,"unique_papercat.csv")

Stats for fields

# count unique term for each column 
term_counts <- tibble(
  field        = c("classterms", "subjecterms", "papercat"),
  unique_terms = c(
    nrow(uniq_class),
    nrow(uniq_subject),
    nrow(uniq_papercat)
  )
)
print(term_counts)

## # A tibble: 3 × 2
##   field       unique_terms
##   <chr>              <int>
## 1 classterms          1672
## 2 subjecterms        21503
## 3 papercat              40

total_rows <- nrow(fields)
cat("\nTotal rows in dataset:", total_rows, "\n")

## 
## Total rows in dataset: 3900085

# missing count for single column
single_missing <- fields %>% 
  summarise(
    miss_classterms   = sum(is.na(classterms)),
    miss_subjecterms  = sum(is.na(subjecterms)),
    miss_papercat     = sum(is.na(papercat))
  ) %>% 
  pivot_longer(everything(),
               names_to = "missing_type",
               values_to = "n") %>% 
  mutate(pct = n / total_rows)

print(single_missing)

## # A tibble: 3 × 3
##   missing_type          n    pct
##   <chr>             <int>  <dbl>
## 1 miss_classterms  430560 0.110 
## 2 miss_subjecterms 349546 0.0896
## 3 miss_papercat    463337 0.119

# missing count & percentage (mutual exclusion)
missing_tbl <- fields %>% 
  mutate(
    miss_class   = is.na(classterms),
    miss_subject = is.na(subjecterms),
    miss_paper   = is.na(papercat),
    
    missing_type = case_when(
      miss_class  & !miss_subject & !miss_paper  ~ "Only classterms",
      !miss_class &  miss_subject & !miss_paper  ~ "Only subjecterms",
      !miss_class & !miss_subject &  miss_paper  ~ "Only papercat",
      
      miss_class  &  miss_subject & !miss_paper  ~ "Classterms + subjecterms",
      miss_class  & !miss_subject &  miss_paper  ~ "Classterms + papercat",
      !miss_class &  miss_subject &  miss_paper  ~ "Subjecterms + papercat",
      
      miss_class  &  miss_subject &  miss_paper  ~ "All three",
      TRUE                                         ~ "None missing"         
    )
  ) %>% 
  filter(missing_type != "None missing") %>%  
  count(missing_type, name = "n") %>% 
  mutate(pct = n / total_rows) %>% 
  arrange(desc(n))

print(missing_tbl)

## # A tibble: 7 × 3
##   missing_type                  n      pct
##   <chr>                     <int>    <dbl>
## 1 Only papercat            370870 0.0951  
## 2 Classterms + subjecterms 302705 0.0776  
## 3 Classterms + papercat     46114 0.0118  
## 4 All three                 45820 0.0117  
## 5 Only classterms           35921 0.00921 
## 6 Subjecterms + papercat      533 0.000137
## 7 Only subjecterms            488 0.000125

Top 10 degree for each field (temporarily commented for saving vector memory)

# gemini_map <- function(vec){
#   prompt <- paste(
#     "You are a data-cleaning assistant.",
#     "Return ONLY a JSON array (no markdown fences).",
#     "Each object needs keys 'original' and 'standard'.",
#     "'standard' = full formal field name in English.",
#     "If the original is already formal, copy it.",
#     "\n\n",
#     paste0(seq_along(vec), ". ", vec, collapse = "\n")
#   )
#   body <- list(contents = list(list(parts = list(list(text = prompt)))))
#   
#   resp <- POST(
#     gemini_endpoint,
#     body = toJSON(body, auto_unbox = TRUE),
#     encode = "json",
#     add_headers(
#       "Content-Type"   = "application/json",
#       "X-goog-api-key" = Sys.getenv("GEMINI_API_KEY")
#     )
#   )
#   stop_for_status(resp)
#   
#   txt <- fromJSON(content(resp, "text", encoding = "UTF-8"),
#                   simplifyVector = FALSE)$candidates[[1]]$content$parts[[1]]$text
#   
#   json_str <- str_extract(txt, "(?s)\\[.*\\]") %>% str_trim()
#   as_tibble(fromJSON(json_str))
# }
# 
# ── 2) Build cross-walk from unique-list CSV ---------------
# make_crosswalk_from_file <- function(unique_csv, out_csv,
#                                      chunk_size = 100, pause = 0.3){
#   vec <- read_csv(unique_csv, show_col_types = FALSE) %>% pull(term)
#   message(basename(unique_csv), " : ", length(vec), " terms")
#   
#   # resume support
#   if (file.exists(out_csv)){
#     done <- read_csv(out_csv, show_col_types = FALSE)$original
#     vec  <- setdiff(vec, done)
#     message("   skipping ", length(done), " already done")
#   }
#   if (length(vec) == 0){
#     message("✓ no new terms; done\n")
#     return(invisible(read_csv(out_csv, show_col_types = FALSE)))
#   }
#   
#   chunks <- split(vec, ceiling(seq_along(vec)/chunk_size))
#   cw_new <- map_dfr(chunks, ~{res <- gemini_map(.x); Sys.sleep(pause); res})
#   
#   cw_all <- if (file.exists(out_csv))
#               bind_rows(read_csv(out_csv, show_col_types = FALSE), cw_new)
#             else cw_new
#   
#   write_csv(cw_all, out_csv)
#   cw_all
# }
# 
# cw_paper   <- make_crosswalk_from_file("unique_papercat.csv",    "crosswalk_papercat.csv")
# cw_class   <- make_crosswalk_from_file("unique_classterms.csv",  "crosswalk_classterms.csv")
# cw_subject <- make_crosswalk_from_file("unique_subjecterms.csv", "crosswalk_subjecterms.csv")

cw_class   <- read_csv("crosswalk_classterms.csv",   show_col_types = FALSE)
cw_subject <- read_csv("crosswalk_subjecterms.csv", show_col_types = FALSE)
cw_paper   <- read_csv("crosswalk_papercat.csv",    show_col_types = FALSE)


process_field <- function(data, column, cw){
  data %>% 
    separate_rows({{ column }}, sep = ";") %>%         
    mutate(term = str_trim({{ column }})) %>%           
    left_join(cw, by = c("term" = "original")) %>%      
    mutate(term_std = coalesce(standard, term)) %>% 
    filter(term_std != "")                           
}


df_class   <- process_field(fields, classterms,  cw_class)
df_subject <- process_field(fields, subjecterms, cw_subject)
df_paper   <- process_field(fields, papercat,    cw_paper)


term_counts <- tibble(
  field        = c("classterms", "subjecterms", "papercat"),
  unique_terms = c(
    n_distinct(df_class$term_std),
    n_distinct(df_subject$term_std),
    n_distinct(df_paper$term_std)
  )
)


top10_class   <- df_class   %>% count(term_std, sort = TRUE, name = "n") %>% slice_head(n = 10)
top10_subject <- df_subject %>% count(term_std, sort = TRUE, name = "n") %>% slice_head(n = 10)
top10_paper   <- df_paper   %>% count(term_std, sort = TRUE, name = "n") %>% slice_head(n = 10)


cat("\nUnique standardized term counts:\n")

## 
## Unique standardized term counts:

print(term_counts)

## # A tibble: 3 × 2
##   field       unique_terms
##   <chr>              <int>
## 1 classterms          1307
## 2 subjecterms        14556
## 3 papercat              28

cat("\nTop-10 classterms:\n");   print(top10_class,   n = 10)

## 
## Top-10 classterms:

## # A tibble: 10 × 2
##    term_std                    n
##    <chr>                   <int>
##  1 Electrical engineering 119262
##  2 Clinical Psychology    112818
##  3 Computer science       103107
##  4 Biochemistry            97833
##  5 Molecular biology       88636
##  6 Mathematics             80093
##  7 Organic chemistry       78727
##  8 Chemistry               77638
##  9 Mechanical engineering  74988
## 10 Psychology              65562

cat("\nTop-10 subjecterms:\n");  print(top10_subject, n = 10)

## 
## Top-10 subjecterms:

## # A tibble: 10 × 2
##    term_std                        n
##    <chr>                       <int>
##  1 Electrical engineering     119072
##  2 Clinical psychology        109087
##  3 Computer science           102982
##  4 Biochemistry                97732
##  5 Molecular biology           88645
##  6 Educational administration  84759
##  7 Mathematics                 80174
##  8 Chemistry                   79078
##  9 Psychotherapy               74903
## 10 Mechanical engineering      74873

cat("\nTop-10 papercat:\n");     print(top10_paper,   n = 10)

## 
## Top-10 papercat:

## # A tibble: 10 × 2
##    term_std                                  n
##    <chr>                                 <int>
##  1 Social sciences                      988716
##  2 Pure sciences                        531967
##  3 Applied sciences                     522924
##  4 Biological sciences                  479691
##  5 Education                            475342
##  6 Health and environmental sciences    321802
##  7 Psychology                           281954
##  8 Language, literature and linguistics 181760
##  9 Communication and the arts           137653
## 10 Philosophy, religion and theology    116360

cat("\nTotal rows in dataset:", nrow(fields), "\n")

## 
## Total rows in dataset: 3900085

df_class %>% 
  distinct(term_std) %>% 
  arrange(term_std) %>% 
  write_csv("unique_standardized_classterms.csv")

df_subject %>% 
  distinct(term_std) %>% 
  arrange(term_std) %>% 
  write_csv("unique_standardized_subjecterms.csv")

df_paper %>% 
  distinct(term_std) %>% 
  arrange(term_std) %>% 
  write_csv("unique_standardized_papercat.csv")

Mapped for classterms and subjecterms

# fields <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)
# 
# map_class_subject <- fields %>%
#   separate_rows(classterms, sep = ";") %>%
#   separate_rows(subjecterms, sep = ";") %>%
#   mutate(
#     classterms  = str_trim(classterms),
#     subjecterms = str_trim(subjecterms)
#   ) %>%
#   distinct(classterms, subjecterms) %>%
#   group_by(classterms) %>%
#   summarise(
#     n_subjects   = n_distinct(subjecterms),
#     subject_list = paste(unique(subjecterms), collapse = "; "),
#     .groups = "drop"
#   )
# 
# 
# write_csv(map_class_subject, "map_classterms_subjecterms.csv")
# 
# # basic stats
# quick_numeric_stats(map_class_subject, n_subjects)
# 
# # top 20 print
# top20_classes <- map_class_subject %>%
#   arrange(desc(n_subjects)) %>%
#   slice_head(n = 20)
# 
# print(top20_classes)
# 
# # histogram
# ggplot(map_class_subject, aes(x = n_subjects)) +
#   geom_histogram(binwidth = 5, fill = "steelblue", color = "steelblue") +
#   labs(
#     title = "Distribution of Number of Subjects per Class Term",
#     x = "Number of Subjects",
#     y = "Count of Class Terms"
#   )
# 
# # top20 
# ggplot(top20_classes, aes(x = reorder(classterms, n_subjects), y = n_subjects)) +
#   geom_col(fill = "steelblue") +
#   coord_flip() +
#   labs(
#     title = "Top 20 Class Terms by Number of Subject Terms",
#     x = "Class Term",
#     y = "Number of Subjects"
#   )

Doctorate-only dataset

Xinxiong Zhao

2025-09-08

Library

Descriptive Function

Import Raw Dataset

Filling the [None] degree using degree level

Identify the Doctorate

Publish date analysis

Degree year analysis

Degree vs Publish Year Analysis

Make crosswalk for degrees

Need adjust for new degree

Top10 Degree

Univeristy Location & World Graph

Graph: number of universities and number of thesis

Graph: number of thesis per universities

clean fields

Stats for fields

Top 10 degree for each field (temporarily commented for saving vector memory)

Mapped for classterms and subjecterms