knitr::opts_chunk$set(echo = TRUE)
options(tibble.width = Inf)

Library

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
library(tidygeocoder)
## Warning: package 'tidygeocoder' was built under R version 4.4.1
library(purrr)
## Warning: package 'purrr' was built under R version 4.4.1
library(leaflet)
library(leaflet.extras)
## Warning: package 'leaflet.extras' was built under R version 4.4.1
library(httr) 
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(sf)
## Warning: package 'sf' was built under R version 4.4.1
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.1
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
Sys.setenv(GEMINI_API_KEY = "AIzaSyCxbDGILqfXqMyjQ6QbGZHdYUJC_NdNOFg")
gemini_endpoint <- "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"

Descriptive Function

quick_numeric_stats <- function(data, column) {
  vec <- data %>% pull({{ column }})
  if (!is.numeric(vec)) stop("Column must be numeric")

  vec_non_na <- vec[!is.na(vec)]

  tibble(
    variable = as_label(enquo(column)),
    total    = length(vec),
    missing  = sum(is.na(vec)),
    min      = min(vec_non_na),
    q1       = quantile(vec_non_na, 0.25),
    median   = median(vec_non_na),
    mean     = mean(vec_non_na),
    q3       = quantile(vec_non_na, 0.75),
    max      = max(vec_non_na),
    sd       = sd(vec_non_na)
  )
}

Import Raw Dataset

dissertations <- read_csv("dissertations-clean.csv", show_col_types = FALSE)
# print total rows of raw data
total_rows <- nrow(dissertations)
cat("Total rows: ", total_rows, "\n")
## Total rows:  5471940
# count all missing value for degree/degreelevel/for both
dissertations %>%  
  summarise(
    total = n(),
    
    missing_degree = sum(
      is.na(degree) |
      str_trim(degree) == "" |
      degree == "[None]"
    ),
    pct_missing_degree = missing_degree / total,
    
    missing_degreelevel = sum(
      is.na(degreelevel) |
      str_trim(degreelevel) == "" |
      degreelevel == "[None]"
    ),
    pct_missing_degreelevel = missing_degreelevel / total,
    
    missing_either = sum(
      (is.na(degree)      | str_trim(degree) == ""      | degree == "[None]") |
      (is.na(degreelevel) | str_trim(degreelevel) == "" | degreelevel == "[None]")
    ),
    pct_missing_either = missing_either / total
  ) %>%
  print()
## # A tibble: 1 × 7
##     total missing_degree pct_missing_degree missing_degreelevel
##     <int>          <int>              <dbl>               <int>
## 1 5471940          94968             0.0174                   0
##   pct_missing_degreelevel missing_either pct_missing_either
##                     <dbl>          <int>              <dbl>
## 1                       0          94968             0.0174

Filling the [None] degree using degree level

dissertations %>%
  filter(degree == "[None]") %>%
  distinct(degreelevel) %>%
  arrange(degreelevel) %>%
  write_csv("degreelevel_to_degree_mapping.csv")

mapping <- read_csv("degreelevel_to_degree_mapping_marked.csv")
## Rows: 117 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): degreelevel, mapped_degree
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dissertations_cleaned <- dissertations %>%
  left_join(mapping, by = "degreelevel") %>%
  mutate(
    degree = if_else(
      degree %in% c("[None]") & !is.na(mapped_degree),
      mapped_degree,
      degree
    )
  )

# Check how many still [None]
dissertations_cleaned %>%
  summarise(
    total = n(),
    still_none_degree = sum(degree == "[None]" | is.na(degree)),
    still_none_degreelevel = sum(degreelevel == "[None]" | is.na(degreelevel))
  )
## # A tibble: 1 × 3
##     total still_none_degree still_none_degreelevel
##     <int>             <int>                  <int>
## 1 5471940                 0                      0

Identify the Doctorate

# print all degree and degreelevel name to see possible keywords
dissertations_cleaned %>% 
  distinct(degree) %>% 
  arrange(degree) %>% 
  write_csv("all_degree_values_raw.csv")

dissertations_cleaned %>% 
  distinct(degreelevel) %>% 
  arrange(degreelevel) %>% 
  write_csv("all_degreelevel_values_raw.csv")

# import two hand-tagged keywords mapping files

deg_map <- read_csv(
  "all_degree_values_marked.csv",
  col_types = cols(
    degree      = col_character(),
    is_doc      = col_logical()
  )
) %>%
  rename(is_doctorate_deg = is_doc)

lvl_map <- read_csv(
  "all_degreelevel_values_marked.csv",
  col_types = cols(
    degreelevel      = col_character(),
    is_doc           = col_logical()
  )
) %>% 
  rename(is_doctorate_level = is_doc)
clean_text <- function(x) stringr::str_squish(stringr::str_replace_all(x, "\u00A0", " "))

deg_map_one <- deg_map %>%
  dplyr::mutate(degree = clean_text(degree)) %>%
  dplyr::group_by(degree) %>%
  dplyr::summarise(
    is_doctorate_deg = any(is_doctorate_deg, na.rm = TRUE),
    .groups = "drop"
  )

lvl_map_one <- lvl_map %>%
  dplyr::mutate(degreelevel = clean_text(degreelevel)) %>%
  dplyr::group_by(degreelevel) %>%
  dplyr::summarise(
    is_doctorate_level = any(is_doctorate_level, na.rm = TRUE),
    .groups = "drop"
  )

dissertations_flagged <- dissertations_cleaned %>%
  dplyr::mutate(
    degree      = clean_text(degree),
    degreelevel = clean_text(degreelevel)
  ) %>%
  dplyr::left_join(deg_map_one, by = "degree",      relationship = "many-to-one") %>%
  dplyr::left_join(lvl_map_one, by = "degreelevel", relationship = "many-to-one") %>%
  dplyr::mutate(
    is_doctorate_deg   = coalesce(is_doctorate_deg,   FALSE),
    is_doctorate_level = coalesce(is_doctorate_level, FALSE),
    by_degree_only     = is_doctorate_deg   & !is_doctorate_level,
    by_level_only      = !is_doctorate_deg &  is_doctorate_level,
    by_both            = is_doctorate_deg   &  is_doctorate_level,
    is_doctorate       = by_both
  )

doctorates <- dissertations_flagged %>%
  dplyr::filter(is_doctorate) %>%
  dplyr::distinct()
  
total_dissertations <- nrow(dissertations_cleaned) 
num_doctorates <- nrow(doctorates)
pct_doctorates <- num_doctorates / total_dissertations

coverage_stats <- dissertations_flagged %>%
  dplyr::filter(
    !is.na(degree),      stringr::str_trim(degree)      != "", degree      != "[None]",
    !is.na(degreelevel), stringr::str_trim(degreelevel) != "", degreelevel != "[None]"
  ) %>%
  dplyr::distinct() %>%
  dplyr::summarise(
    n_is_degree = sum(by_degree_only),
    n_is_level  = sum(by_level_only),
    n_both      = sum(by_both)
  )

cat("Original total rows:", total_dissertations, "\n")
## Original total rows: 5471940
cat("STRICT (both) doctorates:", num_doctorates, " / ", total_dissertations,
    " (", sprintf('%.2f%%', 100*pct_doctorates), ")\n\n", sep = "")
## STRICT (both) doctorates:3900167 / 5471940 (71.28%)
print(coverage_stats)
## # A tibble: 1 × 3
##   n_is_degree n_is_level  n_both
##         <int>      <int>   <int>
## 1           4         29 3900167
write_csv(doctorates, "dissertations-doctorates.csv")

# degree-only rows
degree_only_rows <- dissertations_flagged %>%
  filter(by_degree_only) %>%
  distinct()

write_csv(degree_only_rows, "doctorates-degree-only.csv")

# level-only rows
level_only_rows <- dissertations_flagged %>%
  filter(by_level_only) %>%
  distinct()

write_csv(level_only_rows, "doctorates-level-only.csv")
# after revision, all degree and degreelevel left are unmatched. ASK USE BOTH or OR!

Publish date analysis

# descriptive stats: No missing value
dc <- read_csv("dissertations-doctorates.csv", show_col_types = FALSE)

dc <- dc %>%
  mutate(
    publication_date = as.Date(publishedate),
    pub_year  = as.integer(format(publication_date, "%Y")),
    pub_month = as.integer(format(publication_date, "%m")),
    pub_day   = as.integer(format(publication_date, "%d")),
  )

pub_year_stats <- quick_numeric_stats(dc, pub_year) %>%
  mutate(missing_pct = round(100 * missing / total, 2))

print(pub_year_stats)
## # A tibble: 1 × 11
##   variable   total missing   min    q1 median  mean    q3   max    sd
##   <chr>      <int>   <int> <int> <dbl>  <int> <dbl> <dbl> <int> <dbl>
## 1 pub_year 3900167       0  1950  1983   2000 1996.  2012  2023  17.9
##   missing_pct
##         <dbl>
## 1           0
# data validity test: All data are good
dc <- dc %>%
  mutate(
    invalid_month = !is.na(pub_month) & !(pub_month %in% 1:12),
    invalid_day   = !is.na(pub_day)   & !(pub_day   %in% 1:31),
    parse_fail    = is.na(publication_date) 
  )

date_validity <- dc %>%
  summarise(
    n_total           = n(),
    parse_fail_n      = sum(parse_fail),
    invalid_month_n   = sum(invalid_month),
    invalid_day_n     = sum(invalid_day)
  )
print(date_validity)
## # A tibble: 1 × 4
##   n_total parse_fail_n invalid_month_n invalid_day_n
##     <int>        <int>           <int>         <int>
## 1 3900167            0               0             0
# year distribution
ggplot(dc %>% filter(!is.na(pub_year)), aes(x = pub_year)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Dissertations by Publication Year",
       x = "Publication Year", y = "Count")

Degree year analysis

# Doctorate degree years should be four-digit numeric values (YYYY).  
doctorates <- doctorates %>%
  mutate(degreeyear = suppressWarnings(as.integer(degreeyear)))

# total 30 rows that are not 4 digits
invalid_rows <- doctorates %>%
  filter(is.na(degreeyear) | !str_detect(as.character(degreeyear), "^\\d{4}$"))

write_csv(invalid_rows, "degreeyear_invalid_rows.csv")

# valid 4 digits
doctorates_valid4 <- doctorates %>%
  filter(!is.na(degreeyear) & str_detect(as.character(degreeyear), "^\\d{4}$"))

# 1600-2025 distribution
print( quick_numeric_stats(doctorates_valid4, degreeyear) )
## # A tibble: 1 × 10
##   variable     total missing   min    q1 median  mean    q3   max    sd
##   <chr>        <int>   <int> <int> <dbl>  <int> <dbl> <dbl> <int> <dbl>
## 1 degreeyear 3900137       0  1605  1983   2000 1996.  2012  2023  17.9
ggplot(doctorates_valid4, aes(degreeyear)) +
  geom_histogram(binwidth = 1, boundary = 0, closed = "left",
                 fill = "blue", colour = "grey") +
  scale_x_continuous(breaks = seq(1600, 2025, 25), limits = c(1600, 2025)) +
  labs(title = "Doctorate Degree Years (All 4-digit)", x = "Degree Year", y = "Count") +
  theme_minimal()

# <1950 distribution
doctorates_valid4 %>%
  filter(degreeyear < 1950) %>%
  summarise(n = n(), min_year = min(degreeyear), max_year = max(degreeyear)) %>%
  print()
## # A tibble: 1 × 3
##       n min_year max_year
##   <int>    <int>    <int>
## 1    52     1605     1949
# filter 1950–2023
doctorates_clean <- doctorates_valid4 %>%
  filter(degreeyear >= 1950, degreeyear <= 2023)

print( quick_numeric_stats(doctorates_clean, degreeyear) )
## # A tibble: 1 × 10
##   variable     total missing   min    q1 median  mean    q3   max    sd
##   <chr>        <int>   <int> <int> <dbl>  <int> <dbl> <dbl> <int> <dbl>
## 1 degreeyear 3900085       0  1950  1983   2000 1996.  2012  2023  17.9
ggplot(doctorates_clean, aes(degreeyear)) +
  geom_histogram(binwidth = 5, boundary = 0, closed = "left",
                 fill = "blue", colour = "grey") +
  scale_x_continuous(breaks = seq(1950, 2025, 5), limits = c(1950, 2025)) +
  labs(title = "Doctorate Degree Years (1950–2023, 5-year bins)",
       x = "Degree Year", y = "Count") +
  theme_minimal()

write_csv(doctorates_clean, "doctorates_1950_2023.csv")

Degree vs Publish Year Analysis

doctorates_clean <- readr::read_csv("dissertations-doctorates.csv", show_col_types = FALSE)

year_min <- 1950; year_max <- 2023

dc <- doctorates_clean %>%
  mutate(
    pub_year = year(publishedate),
    degreeyear_int = suppressWarnings(as.integer(degreeyear))
  )

# coverage table
coverage <- dc %>%
  dplyr::summarise(
    n_total     = dplyr::n(),
    n_deg_valid = sum(stringr::str_detect(as.character(degreeyear), "^\\d{4}$") &
                      as.integer(degreeyear) >= year_min & as.integer(degreeyear) <= year_max, na.rm = TRUE),
    n_pub_valid = sum(!is.na(pub_year), na.rm = TRUE),
    n_both      = sum(!is.na(pub_year) &
                      stringr::str_detect(as.character(degreeyear), "^\\d{4}$") &
                      as.integer(degreeyear) >= year_min & as.integer(degreeyear) <= year_max, na.rm = TRUE)
  ) %>%
  dplyr::mutate(
    pct_deg_valid = n_deg_valid / n_total,
    pct_pub_valid = n_pub_valid / n_total,
    pct_both      = n_both      / n_total
  )
print(coverage)
## # A tibble: 1 × 7
##   n_total n_deg_valid n_pub_valid  n_both pct_deg_valid pct_pub_valid pct_both
##     <int>       <int>       <int>   <int>         <dbl>         <dbl>    <dbl>
## 1 3900167     3900085     3900167 3900085          1.00             1     1.00
readr::write_csv(coverage, "coverage_degree_vs_publish.csv")

# compute lag = publish year − degree year, usually publish year >= degree year
lag_df <- dc %>%
  dplyr::mutate(degreeyear = suppressWarnings(as.integer(degreeyear))) %>%
  dplyr::filter(
    !is.na(pub_year),
    !is.na(degreeyear),
    dplyr::between(degreeyear, year_min, year_max)
  ) %>%
  dplyr::mutate(year_diff = pub_year - degreeyear)

cat("Lag sample size (both present, degree ", year_min, "–", year_max, "): ",
    nrow(lag_df), "\n", sep = "")
## Lag sample size (both present, degree 1950–2023): 3900085
# basic stats
lag_num <- quick_numeric_stats(lag_df, year_diff)

lag_counts <- lag_df %>%
  dplyr::summarise(
    n_valid     = dplyr::n(),
    n_same_year = sum(year_diff == 0),
    n_before    = sum(year_diff < 0),
    n_after     = sum(year_diff > 0)
  ) %>%
  dplyr::mutate(
    share_same  = n_same_year / n_valid,
    share_before= n_before    / n_valid,
    share_after = n_after     / n_valid
  )

lag_summary <- dplyr::bind_cols(lag_num, lag_counts)
print(lag_summary)
## # A tibble: 1 × 17
##   variable    total missing   min    q1 median   mean    q3   max    sd n_valid
##   <chr>       <int>   <int> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
## 1 year_diff 3900085       0   -51     0      0 0.0150     0    61 0.626 3900085
##   n_same_year n_before n_after share_same share_before share_after
##         <int>    <int>   <int>      <dbl>        <dbl>       <dbl>
## 1     3887833     1842   10410      0.997     0.000472     0.00267
readr::write_csv(lag_summary, "lag_summary_and_shares.csv")

# histogram of lag
zoom_xlim <- c(-3, 5)  
ggplot(lag_df, aes(x = year_diff)) +
  geom_histogram(binwidth = 1, boundary = 0, closed = "left",
                 aes(y = after_stat(..count.. / sum(..count..)))) +
  coord_cartesian(xlim = zoom_xlim) +
  labs(title = paste0("Lag = publish year − degree year (zoom ", zoom_xlim[1], "…", zoom_xlim[2], ")"),
       x = "Years", y = "Share of dissertations") +
  theme_minimal()

# outliers (publish >5y after OR >1y before degree)
neg_outlier <- -1; pos_outlier <- 5 
outliers <- lag_df %>%
  dplyr::filter(year_diff < neg_outlier | year_diff > pos_outlier) %>%
  dplyr::arrange(dplyr::desc(abs(year_diff)))
readr::write_csv(outliers, "outliers_lag.csv")
cat("Outliers flagged: ", nrow(outliers),
    " (criteria: <", neg_outlier, " or >", pos_outlier, ")\n", sep = "")
## Outliers flagged: 2488 (criteria: <-1 or >5)

Make crosswalk for degrees

Need adjust for new degree

deg_unique <- doctorates_clean %>%            
  distinct(degree) %>% 
  pull()
cat("Unique degree variants:", length(deg_unique), "\n")
## Unique degree variants: 389
#doctorates_clean <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)
# 
# gemini_map <- function(deg_vec) {
# 
# # prompt
#   prompt <- paste(
#     "You are a data-cleaning assistant.",
#     "OUTPUT must be ONLY a JSON array, no preface, no markdown fences.",
#     "Each object needs keys 'original' and 'standard'.",
#     "- 'standard' = full formal degree name (e.g. 'Doctor of Philosophy').",
#     "- Do NOT return abbreviations like PhD, EdD, DBA, etc.",
#     "- If the original is already formal, copy it to 'standard'.",
#     "- Keep the list order.",
#     "\n\n",
#     paste0(seq_along(deg_vec), ". ", deg_vec, collapse = "\n")
#   )
# 
#   body <- list(
#     contents = list(list(parts = list(list(text = prompt))))
#   )
# 
#   resp <- POST(
#     gemini_endpoint,
#     body   = toJSON(body, auto_unbox = TRUE),
#     encode = "json",
#     add_headers(
#       `Content-Type`   = "application/json",
#       `X-goog-api-key` = Sys.getenv("GEMINI_API_KEY")
#     )
#   )
#   stop_for_status(resp)
# 
#   txt <- fromJSON(content(resp, "text", encoding = "UTF-8"),
#                   simplifyVector = FALSE)$candidates[[1]]$content$parts[[1]]$text
# 
#   json_str <- txt |>
#     str_extract("(?s)\\[.*\\]") |>
#     str_trim()
# 
#   as_tibble(fromJSON(json_str))
# }

# chunks <- split(deg_unique, ceiling(seq_along(deg_unique) / 50))
# crosswalk <- map_dfr(chunks, \(v) { out <- gemini_map(v); Sys.sleep(1); out })

#write_csv(crosswalk, "degree_crosswalk_api_raw.csv")
#print(crosswalk, n = 20)

Top10 Degree

crosswalk <- readr::read_csv(
  "degree_crosswalk_api_marked.csv", 
  show_col_types = FALSE
)

doctorates_std <- doctorates_clean |>
  dplyr::left_join(crosswalk, by = c("degree" = "original")) |>
  dplyr::mutate(
    standard_degree = dplyr::coalesce(standard, degree)
  )

# top 10 degree
top10_degree <- doctorates_std |>
  dplyr::count(standard_degree, sort = TRUE, name = "n") |>
  dplyr::slice_head(n = 10)

total_degrees <- dplyr::n_distinct(doctorates_std$standard_degree)

cat("Total unique degrees (standardized):", total_degrees, "\n\n")
## Total unique degrees (standardized): 313
print(top10_degree)
## # A tibble: 10 × 2
##    standard_degree                         n
##    <chr>                               <int>
##  1 Doctor of Philosophy              3338159
##  2 Doctor of Education                222885
##  3 Doctorate/Docteur                  128825
##  4 Doctor of Psychology                30118
##  5 Doctor of Medicine/Medical Doctor   27044
##  6 Doctor of Musical Arts              16629
##  7 Doctor of Business Administration   15707
##  8 Doctor of Ministry                  15219
##  9 Doctor of Engineering               11751
## 10 Doctor of Clinical Psychology        8115
degree_counts <- doctorates_std %>% 
  dplyr::count(standard_degree, name = "n") %>% 
  dplyr::arrange(desc(n))

write_csv(degree_counts, "all_standard_degrees_with_counts.csv")
# top 10 universities (total count)
top10_by_total <- doctorates_clean %>%
count(university, name = "total_papers") %>%
slice_max(total_papers, n = 10)
print(top10_by_total)
## # A tibble: 10 × 2
##    university                                    total_papers
##    <chr>                                                <int>
##  1 University of California, Berkeley                   48513
##  2 The University of Wisconsin - Madison                48180
##  3 University of Oxford (United Kingdom)                44764
##  4 The University of Manchester (United Kingdom)        44388
##  5 University of Illinois at Urbana-Champaign           41632
##  6 University of Michigan                               40521
##  7 The Ohio State University                            39845
##  8 Columbia University                                  38748
##  9 University of Minnesota                              38575
## 10 Harvard University                                   38574
# top 10 universities (average paper per year)
univ_avg_per_year <- doctorates_clean %>%
  group_by(university) %>%
  summarise(
    total_papers    = n(),
    active_years    = n_distinct(degreeyear),
    papers_per_year = total_papers / active_years
  ) %>%
  ungroup() %>%
  arrange(desc(papers_per_year))

# total school count
total_schools <- n_distinct(doctorates_clean$university)

# top 10
top10_by_avg <- univ_avg_per_year %>%
  slice_max(papers_per_year, n = 10)
print(top10_by_avg)
## # A tibble: 10 × 4
##    university                                    total_papers active_years
##    <chr>                                                <int>        <int>
##  1 University of California, Berkeley                   48513           73
##  2 The University of Wisconsin - Madison                48180           74
##  3 University of Oxford (United Kingdom)                44764           73
##  4 The University of Manchester (United Kingdom)        44388           73
##  5 University of Illinois at Urbana-Champaign           41632           71
##  6 Capella University                                   14577           26
##  7 University of Michigan                               40521           73
##  8 The Ohio State University                            39845           73
##  9 Harvard University                                   38574           73
## 10 Columbia University                                  38748           74
##    papers_per_year
##              <dbl>
##  1            665.
##  2            651.
##  3            613.
##  4            608.
##  5            586.
##  6            561.
##  7            555.
##  8            546.
##  9            528.
## 10            524.
cat("Total universities:", total_schools, "\n\n")
## Total universities: 2985

Univeristy Location & World Graph

doctorates_clean %>%
  distinct(university) %>%
  arrange(university) %>%
  write_csv("unique_universities.csv")

# Google Geocoding API
# Sys.setenv(GOOGLEGEOCODE_API_KEY = "AIzaSyCiqn3cISXHi85YxG5ycdSMD6g9tctlpQE")
# uni_raw <- read_csv("unique_universities.csv", col_names = "university")
# 
# geo_out <- uni_raw %>%
#   geocode(
#     address      = university,
#     method       = "google",
#     lat          = latitude,
#     long         = longitude,
#     full_results = TRUE,
#     limit        = 1
#   )
# 
# uni_coords_only <- geo_out %>%
#   mutate(
#     lat = round(latitude,  6),
#     lon = round(longitude, 6)
#   ) %>%
#   transmute(
#     university        = university,
#     lon,
#     lat,
#     address           = formatted_address
#   )
# 
# print(head(uni_coords_only, 20))
# write_csv(uni_coords_only, "universities_with_coords_and_address.csv")

# missing
uni_coords <- read_csv("universities_with_coords_and_address.csv")
## Rows: 2985 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): university, address
## dbl (2): lon, lat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
missing_summary <- uni_coords %>%
  summarise(
    total = n(),
    missing_lon      = sum(is.na(lon)),
    missing_lat      = sum(is.na(lat)),
    missing_both     = sum(is.na(lon) & is.na(lat)),
    pct_missing_lon  = missing_lon  / total,
    pct_missing_lat  = missing_lat  / total,
    pct_missing_both = missing_both / total
  )

print(missing_summary)
## # A tibble: 1 × 7
##   total missing_lon missing_lat missing_both pct_missing_lon pct_missing_lat
##   <int>       <int>       <int>        <int>           <dbl>           <dbl>
## 1  2985          74          74           74          0.0248          0.0248
##   pct_missing_both
##              <dbl>
## 1           0.0248
# hand-tagged dataset, deleted "None"
unis <- read_csv("universities_with_coords_marked.csv")
## Rows: 2983 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): university, address
## dbl (2): lon, lat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# check missing values
missing_summary2 <- unis %>%
  summarise(
    total_rows      = n(),
    missing_lon     = sum(is.na(lon)),
    missing_lat     = sum(is.na(lat)),
    missing_address = sum(is.na(address))
  )

print(missing_summary2) 
## # A tibble: 1 × 4
##   total_rows missing_lon missing_lat missing_address
##        <int>       <int>       <int>           <int>
## 1       2983           0           0               0

Graph: number of universities and number of thesis

# summarize the number of papers by university
thesis_by_uni <- doctorates_clean %>%
  count(university, name = "thesis_n")

unis_counts <- unis %>%
  left_join(thesis_by_uni, by = "university") %>%
  mutate(thesis_n = tidyr::replace_na(thesis_n, 0))

# find country using lon and lat
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf") %>%
  dplyr::select(country = admin)  

unis_sf <- st_as_sf(
  unis_counts,
  coords = c("lon", "lat"),
  crs    = 4326, remove = FALSE
)

unis_country <- st_join(unis_sf, world, join = st_within) %>%
  st_drop_geometry()

missing_init <- which(is.na(unis_country$country))
cat("Initial unmatched universities:", length(missing_init), "\n")
## Initial unmatched universities: 85
# manual check
if (length(missing_init) > 0) {
  write_csv(unis_country[missing_init, ], "unmatched_unis.csv")
}

manual_fix <- readr::read_csv("unmatched_unis_marked.csv", show_col_types = FALSE) %>% 
  dplyr::select(university, country)

unis_country <- unis_country %>% 
  dplyr::left_join(manual_fix, by = "university", suffix = c("", "_man")) %>% 
  dplyr::mutate(country = dplyr::coalesce(country, country_man)) %>% 
  dplyr::select(-country_man)

missing_left <- sum(is.na(unis_country$country))
cat("Still Missing:", missing_left, "\n")
## Still Missing: 0
country_sum <- unis_country %>%
  group_by(country) %>%
  summarise(
    lon          = mean(lon, na.rm = TRUE),
    lat          = mean(lat, na.rm = TRUE),
    univ_n       = n_distinct(university),
    thesis_total = sum(thesis_n),
    .groups      = "drop"
  ) %>%
  filter(!is.na(country))

bins_cty <- c(0, 500, 1000, 5000, 10000, 30000, Inf)
pal_cty  <- colorBin("Blues", domain = country_sum$thesis_total, bins = bins_cty)

leaflet(country_sum) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    ~lon, ~lat,
    radius      = ~pmax(3, sqrt(univ_n)),
    fillColor   = ~pal_cty(thesis_total),
    fillOpacity = 0.85, stroke = FALSE,
    popup = ~paste0(
      "<b>", country, "</b><br/>",
      "Universities: ", univ_n, "<br/>",
      "Doctoral theses: ", thesis_total
    )
  ) %>%
  addLegend(
    "bottomright", pal = pal_cty, values = ~thesis_total,
    title = "Doctoral theses (country)", opacity = 1
  )

Graph: number of thesis per universities

bins_uni <- c(0, 500, 1000, 5000, 10000, 30000, Inf)
pal_uni  <- colorBin("YlOrRd", domain = unis_counts$thesis_n, bins = bins_uni)

fixed_radius <- 6

leaflet(unis_counts) %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addCircleMarkers(
    lng         = ~lon,
    lat         = ~lat,
    radius      = fixed_radius,         
    fillColor   = ~pal_uni(thesis_n),
    fillOpacity = 0.85, stroke = FALSE,
    popup = ~paste0(
      "<b>", university, "</b><br/>",
      "Doctoral theses: ", thesis_n
    )
  ) %>% 
  addLegend(
    "bottomright",
    pal     = pal_uni,
    values  = ~thesis_n,
    title   = "Doctoral theses per university",
    opacity = 1
  )

clean fields

fields_raw <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)

clean_terms <- function(x) {
  x %>% 
    str_replace_all("\\[|\\]", "") %>% 
    str_squish() %>% 
    str_to_sentence(locale = "en")
}

fields <- fields_raw %>% 
  mutate(across(any_of(c("classterms", "subjecterms", "papercat")), clean_terms)) %>% 
  mutate(across(where(is.character), ~ na_if(., ""))) 

uniq_class <- fields %>%
  separate_rows(classterms, sep = ";") %>%
  mutate(term = str_trim(classterms)) %>%
  filter(!is.na(term) & term != "") %>%
  distinct(term)

uniq_subject <- fields %>%
  separate_rows(subjecterms, sep = ";") %>%
  mutate(term = str_trim(subjecterms)) %>%
  filter(!is.na(term) & term != "") %>%
  distinct(term)

uniq_papercat <- fields %>%
  separate_rows(papercat, sep = ";") %>%
  mutate(term = str_trim(papercat)) %>%
  filter(!is.na(term) & term != "") %>%
  distinct(term)

  
# export csv
write_csv(uniq_class,   "unique_classterms.csv")
write_csv(uniq_subject, "unique_subjecterms.csv")
write_csv(uniq_papercat,"unique_papercat.csv")

Stats for fields

# count unique term for each column 
term_counts <- tibble(
  field        = c("classterms", "subjecterms", "papercat"),
  unique_terms = c(
    nrow(uniq_class),
    nrow(uniq_subject),
    nrow(uniq_papercat)
  )
)
print(term_counts)
## # A tibble: 3 × 2
##   field       unique_terms
##   <chr>              <int>
## 1 classterms          1672
## 2 subjecterms        21503
## 3 papercat              40
total_rows <- nrow(fields)
cat("\nTotal rows in dataset:", total_rows, "\n")
## 
## Total rows in dataset: 3900085
# missing count for single column
single_missing <- fields %>% 
  summarise(
    miss_classterms   = sum(is.na(classterms)),
    miss_subjecterms  = sum(is.na(subjecterms)),
    miss_papercat     = sum(is.na(papercat))
  ) %>% 
  pivot_longer(everything(),
               names_to = "missing_type",
               values_to = "n") %>% 
  mutate(pct = n / total_rows)

print(single_missing)
## # A tibble: 3 × 3
##   missing_type          n    pct
##   <chr>             <int>  <dbl>
## 1 miss_classterms  430560 0.110 
## 2 miss_subjecterms 349546 0.0896
## 3 miss_papercat    463337 0.119
# missing count & percentage (mutual exclusion)
missing_tbl <- fields %>% 
  mutate(
    miss_class   = is.na(classterms),
    miss_subject = is.na(subjecterms),
    miss_paper   = is.na(papercat),
    
    missing_type = case_when(
      miss_class  & !miss_subject & !miss_paper  ~ "Only classterms",
      !miss_class &  miss_subject & !miss_paper  ~ "Only subjecterms",
      !miss_class & !miss_subject &  miss_paper  ~ "Only papercat",
      
      miss_class  &  miss_subject & !miss_paper  ~ "Classterms + subjecterms",
      miss_class  & !miss_subject &  miss_paper  ~ "Classterms + papercat",
      !miss_class &  miss_subject &  miss_paper  ~ "Subjecterms + papercat",
      
      miss_class  &  miss_subject &  miss_paper  ~ "All three",
      TRUE                                         ~ "None missing"         
    )
  ) %>% 
  filter(missing_type != "None missing") %>%  
  count(missing_type, name = "n") %>% 
  mutate(pct = n / total_rows) %>% 
  arrange(desc(n))

print(missing_tbl)
## # A tibble: 7 × 3
##   missing_type                  n      pct
##   <chr>                     <int>    <dbl>
## 1 Only papercat            370870 0.0951  
## 2 Classterms + subjecterms 302705 0.0776  
## 3 Classterms + papercat     46114 0.0118  
## 4 All three                 45820 0.0117  
## 5 Only classterms           35921 0.00921 
## 6 Subjecterms + papercat      533 0.000137
## 7 Only subjecterms            488 0.000125

Top 10 degree for each field (temporarily commented for saving vector memory)

# gemini_map <- function(vec){
#   prompt <- paste(
#     "You are a data-cleaning assistant.",
#     "Return ONLY a JSON array (no markdown fences).",
#     "Each object needs keys 'original' and 'standard'.",
#     "'standard' = full formal field name in English.",
#     "If the original is already formal, copy it.",
#     "\n\n",
#     paste0(seq_along(vec), ". ", vec, collapse = "\n")
#   )
#   body <- list(contents = list(list(parts = list(list(text = prompt)))))
#   
#   resp <- POST(
#     gemini_endpoint,
#     body = toJSON(body, auto_unbox = TRUE),
#     encode = "json",
#     add_headers(
#       "Content-Type"   = "application/json",
#       "X-goog-api-key" = Sys.getenv("GEMINI_API_KEY")
#     )
#   )
#   stop_for_status(resp)
#   
#   txt <- fromJSON(content(resp, "text", encoding = "UTF-8"),
#                   simplifyVector = FALSE)$candidates[[1]]$content$parts[[1]]$text
#   
#   json_str <- str_extract(txt, "(?s)\\[.*\\]") %>% str_trim()
#   as_tibble(fromJSON(json_str))
# }
# 
# ── 2) Build cross-walk from unique-list CSV ---------------
# make_crosswalk_from_file <- function(unique_csv, out_csv,
#                                      chunk_size = 100, pause = 0.3){
#   vec <- read_csv(unique_csv, show_col_types = FALSE) %>% pull(term)
#   message(basename(unique_csv), " : ", length(vec), " terms")
#   
#   # resume support
#   if (file.exists(out_csv)){
#     done <- read_csv(out_csv, show_col_types = FALSE)$original
#     vec  <- setdiff(vec, done)
#     message("   skipping ", length(done), " already done")
#   }
#   if (length(vec) == 0){
#     message("✓ no new terms; done\n")
#     return(invisible(read_csv(out_csv, show_col_types = FALSE)))
#   }
#   
#   chunks <- split(vec, ceiling(seq_along(vec)/chunk_size))
#   cw_new <- map_dfr(chunks, ~{res <- gemini_map(.x); Sys.sleep(pause); res})
#   
#   cw_all <- if (file.exists(out_csv))
#               bind_rows(read_csv(out_csv, show_col_types = FALSE), cw_new)
#             else cw_new
#   
#   write_csv(cw_all, out_csv)
#   cw_all
# }
# 
# cw_paper   <- make_crosswalk_from_file("unique_papercat.csv",    "crosswalk_papercat.csv")
# cw_class   <- make_crosswalk_from_file("unique_classterms.csv",  "crosswalk_classterms.csv")
# cw_subject <- make_crosswalk_from_file("unique_subjecterms.csv", "crosswalk_subjecterms.csv")

cw_class   <- read_csv("crosswalk_classterms.csv",   show_col_types = FALSE)
cw_subject <- read_csv("crosswalk_subjecterms.csv", show_col_types = FALSE)
cw_paper   <- read_csv("crosswalk_papercat.csv",    show_col_types = FALSE)


process_field <- function(data, column, cw){
  data %>% 
    separate_rows({{ column }}, sep = ";") %>%         
    mutate(term = str_trim({{ column }})) %>%           
    left_join(cw, by = c("term" = "original")) %>%      
    mutate(term_std = coalesce(standard, term)) %>% 
    filter(term_std != "")                           
}


df_class   <- process_field(fields, classterms,  cw_class)
df_subject <- process_field(fields, subjecterms, cw_subject)
df_paper   <- process_field(fields, papercat,    cw_paper)


term_counts <- tibble(
  field        = c("classterms", "subjecterms", "papercat"),
  unique_terms = c(
    n_distinct(df_class$term_std),
    n_distinct(df_subject$term_std),
    n_distinct(df_paper$term_std)
  )
)


top10_class   <- df_class   %>% count(term_std, sort = TRUE, name = "n") %>% slice_head(n = 10)
top10_subject <- df_subject %>% count(term_std, sort = TRUE, name = "n") %>% slice_head(n = 10)
top10_paper   <- df_paper   %>% count(term_std, sort = TRUE, name = "n") %>% slice_head(n = 10)


cat("\nUnique standardized term counts:\n")
## 
## Unique standardized term counts:
print(term_counts)
## # A tibble: 3 × 2
##   field       unique_terms
##   <chr>              <int>
## 1 classterms          1307
## 2 subjecterms        14556
## 3 papercat              28
cat("\nTop-10 classterms:\n");   print(top10_class,   n = 10)
## 
## Top-10 classterms:
## # A tibble: 10 × 2
##    term_std                    n
##    <chr>                   <int>
##  1 Electrical engineering 119262
##  2 Clinical Psychology    112818
##  3 Computer science       103107
##  4 Biochemistry            97833
##  5 Molecular biology       88636
##  6 Mathematics             80093
##  7 Organic chemistry       78727
##  8 Chemistry               77638
##  9 Mechanical engineering  74988
## 10 Psychology              65562
cat("\nTop-10 subjecterms:\n");  print(top10_subject, n = 10)
## 
## Top-10 subjecterms:
## # A tibble: 10 × 2
##    term_std                        n
##    <chr>                       <int>
##  1 Electrical engineering     119072
##  2 Clinical psychology        109087
##  3 Computer science           102982
##  4 Biochemistry                97732
##  5 Molecular biology           88645
##  6 Educational administration  84759
##  7 Mathematics                 80174
##  8 Chemistry                   79078
##  9 Psychotherapy               74903
## 10 Mechanical engineering      74873
cat("\nTop-10 papercat:\n");     print(top10_paper,   n = 10)
## 
## Top-10 papercat:
## # A tibble: 10 × 2
##    term_std                                  n
##    <chr>                                 <int>
##  1 Social sciences                      988716
##  2 Pure sciences                        531967
##  3 Applied sciences                     522924
##  4 Biological sciences                  479691
##  5 Education                            475342
##  6 Health and environmental sciences    321802
##  7 Psychology                           281954
##  8 Language, literature and linguistics 181760
##  9 Communication and the arts           137653
## 10 Philosophy, religion and theology    116360
cat("\nTotal rows in dataset:", nrow(fields), "\n")
## 
## Total rows in dataset: 3900085
df_class %>% 
  distinct(term_std) %>% 
  arrange(term_std) %>% 
  write_csv("unique_standardized_classterms.csv")

df_subject %>% 
  distinct(term_std) %>% 
  arrange(term_std) %>% 
  write_csv("unique_standardized_subjecterms.csv")

df_paper %>% 
  distinct(term_std) %>% 
  arrange(term_std) %>% 
  write_csv("unique_standardized_papercat.csv")

Mapped for classterms and subjecterms

# fields <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)
# 
# map_class_subject <- fields %>%
#   separate_rows(classterms, sep = ";") %>%
#   separate_rows(subjecterms, sep = ";") %>%
#   mutate(
#     classterms  = str_trim(classterms),
#     subjecterms = str_trim(subjecterms)
#   ) %>%
#   distinct(classterms, subjecterms) %>%
#   group_by(classterms) %>%
#   summarise(
#     n_subjects   = n_distinct(subjecterms),
#     subject_list = paste(unique(subjecterms), collapse = "; "),
#     .groups = "drop"
#   )
# 
# 
# write_csv(map_class_subject, "map_classterms_subjecterms.csv")
# 
# # basic stats
# quick_numeric_stats(map_class_subject, n_subjects)
# 
# # top 20 print
# top20_classes <- map_class_subject %>%
#   arrange(desc(n_subjects)) %>%
#   slice_head(n = 20)
# 
# print(top20_classes)
# 
# # histogram
# ggplot(map_class_subject, aes(x = n_subjects)) +
#   geom_histogram(binwidth = 5, fill = "steelblue", color = "steelblue") +
#   labs(
#     title = "Distribution of Number of Subjects per Class Term",
#     x = "Number of Subjects",
#     y = "Count of Class Terms"
#   )
# 
# # top20 
# ggplot(top20_classes, aes(x = reorder(classterms, n_subjects), y = n_subjects)) +
#   geom_col(fill = "steelblue") +
#   coord_flip() +
#   labs(
#     title = "Top 20 Class Terms by Number of Subject Terms",
#     x = "Class Term",
#     y = "Number of Subjects"
#   )