knitr::opts_chunk$set(echo = TRUE)
options(tibble.width = Inf)
Library
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
library(tidygeocoder)
## Warning: package 'tidygeocoder' was built under R version 4.4.1
library(purrr)
## Warning: package 'purrr' was built under R version 4.4.1
library(leaflet)
library(leaflet.extras)
## Warning: package 'leaflet.extras' was built under R version 4.4.1
library(httr)
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(sf)
## Warning: package 'sf' was built under R version 4.4.1
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.1
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Sys.setenv(GEMINI_API_KEY = "AIzaSyCxbDGILqfXqMyjQ6QbGZHdYUJC_NdNOFg")
gemini_endpoint <- "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
Descriptive Function
quick_numeric_stats <- function(data, column) {
vec <- data %>% pull({{ column }})
if (!is.numeric(vec)) stop("Column must be numeric")
vec_non_na <- vec[!is.na(vec)]
tibble(
variable = as_label(enquo(column)),
total = length(vec),
missing = sum(is.na(vec)),
min = min(vec_non_na),
q1 = quantile(vec_non_na, 0.25),
median = median(vec_non_na),
mean = mean(vec_non_na),
q3 = quantile(vec_non_na, 0.75),
max = max(vec_non_na),
sd = sd(vec_non_na)
)
}
Import Raw Dataset
dissertations <- read_csv("dissertations-clean.csv", show_col_types = FALSE)
# print total rows of raw data
total_rows <- nrow(dissertations)
cat("Total rows: ", total_rows, "\n")
## Total rows: 5471940
# count all missing value for degree/degreelevel/for both
dissertations %>%
summarise(
total = n(),
missing_degree = sum(
is.na(degree) |
str_trim(degree) == "" |
degree == "[None]"
),
pct_missing_degree = missing_degree / total,
missing_degreelevel = sum(
is.na(degreelevel) |
str_trim(degreelevel) == "" |
degreelevel == "[None]"
),
pct_missing_degreelevel = missing_degreelevel / total,
missing_either = sum(
(is.na(degree) | str_trim(degree) == "" | degree == "[None]") |
(is.na(degreelevel) | str_trim(degreelevel) == "" | degreelevel == "[None]")
),
pct_missing_either = missing_either / total
) %>%
print()
## # A tibble: 1 × 7
## total missing_degree pct_missing_degree missing_degreelevel
## <int> <int> <dbl> <int>
## 1 5471940 94968 0.0174 0
## pct_missing_degreelevel missing_either pct_missing_either
## <dbl> <int> <dbl>
## 1 0 94968 0.0174
Filling the [None] degree using degree level
dissertations %>%
filter(degree == "[None]") %>%
distinct(degreelevel) %>%
arrange(degreelevel) %>%
write_csv("degreelevel_to_degree_mapping.csv")
mapping <- read_csv("degreelevel_to_degree_mapping_marked.csv")
## Rows: 117 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): degreelevel, mapped_degree
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dissertations_cleaned <- dissertations %>%
left_join(mapping, by = "degreelevel") %>%
mutate(
degree = if_else(
degree %in% c("[None]") & !is.na(mapped_degree),
mapped_degree,
degree
)
)
# Check how many still [None]
dissertations_cleaned %>%
summarise(
total = n(),
still_none_degree = sum(degree == "[None]" | is.na(degree)),
still_none_degreelevel = sum(degreelevel == "[None]" | is.na(degreelevel))
)
## # A tibble: 1 × 3
## total still_none_degree still_none_degreelevel
## <int> <int> <int>
## 1 5471940 0 0
Identify the Doctorate
# print all degree and degreelevel name to see possible keywords
dissertations_cleaned %>%
distinct(degree) %>%
arrange(degree) %>%
write_csv("all_degree_values_raw.csv")
dissertations_cleaned %>%
distinct(degreelevel) %>%
arrange(degreelevel) %>%
write_csv("all_degreelevel_values_raw.csv")
# import two hand-tagged keywords mapping files
deg_map <- read_csv(
"all_degree_values_marked.csv",
col_types = cols(
degree = col_character(),
is_doc = col_logical()
)
) %>%
rename(is_doctorate_deg = is_doc)
lvl_map <- read_csv(
"all_degreelevel_values_marked.csv",
col_types = cols(
degreelevel = col_character(),
is_doc = col_logical()
)
) %>%
rename(is_doctorate_level = is_doc)
clean_text <- function(x) stringr::str_squish(stringr::str_replace_all(x, "\u00A0", " "))
deg_map_one <- deg_map %>%
dplyr::mutate(degree = clean_text(degree)) %>%
dplyr::group_by(degree) %>%
dplyr::summarise(
is_doctorate_deg = any(is_doctorate_deg, na.rm = TRUE),
.groups = "drop"
)
lvl_map_one <- lvl_map %>%
dplyr::mutate(degreelevel = clean_text(degreelevel)) %>%
dplyr::group_by(degreelevel) %>%
dplyr::summarise(
is_doctorate_level = any(is_doctorate_level, na.rm = TRUE),
.groups = "drop"
)
dissertations_flagged <- dissertations_cleaned %>%
dplyr::mutate(
degree = clean_text(degree),
degreelevel = clean_text(degreelevel)
) %>%
dplyr::left_join(deg_map_one, by = "degree", relationship = "many-to-one") %>%
dplyr::left_join(lvl_map_one, by = "degreelevel", relationship = "many-to-one") %>%
dplyr::mutate(
is_doctorate_deg = coalesce(is_doctorate_deg, FALSE),
is_doctorate_level = coalesce(is_doctorate_level, FALSE),
by_degree_only = is_doctorate_deg & !is_doctorate_level,
by_level_only = !is_doctorate_deg & is_doctorate_level,
by_both = is_doctorate_deg & is_doctorate_level,
is_doctorate = by_both
)
doctorates <- dissertations_flagged %>%
dplyr::filter(is_doctorate) %>%
dplyr::distinct()
total_dissertations <- nrow(dissertations_cleaned)
num_doctorates <- nrow(doctorates)
pct_doctorates <- num_doctorates / total_dissertations
coverage_stats <- dissertations_flagged %>%
dplyr::filter(
!is.na(degree), stringr::str_trim(degree) != "", degree != "[None]",
!is.na(degreelevel), stringr::str_trim(degreelevel) != "", degreelevel != "[None]"
) %>%
dplyr::distinct() %>%
dplyr::summarise(
n_is_degree = sum(by_degree_only),
n_is_level = sum(by_level_only),
n_both = sum(by_both)
)
cat("Original total rows:", total_dissertations, "\n")
## Original total rows: 5471940
cat("STRICT (both) doctorates:", num_doctorates, " / ", total_dissertations,
" (", sprintf('%.2f%%', 100*pct_doctorates), ")\n\n", sep = "")
## STRICT (both) doctorates:3900167 / 5471940 (71.28%)
print(coverage_stats)
## # A tibble: 1 × 3
## n_is_degree n_is_level n_both
## <int> <int> <int>
## 1 4 29 3900167
write_csv(doctorates, "dissertations-doctorates.csv")
# degree-only rows
degree_only_rows <- dissertations_flagged %>%
filter(by_degree_only) %>%
distinct()
write_csv(degree_only_rows, "doctorates-degree-only.csv")
# level-only rows
level_only_rows <- dissertations_flagged %>%
filter(by_level_only) %>%
distinct()
write_csv(level_only_rows, "doctorates-level-only.csv")
# after revision, all degree and degreelevel left are unmatched. ASK USE BOTH or OR!
Publish date analysis
# descriptive stats: No missing value
dc <- read_csv("dissertations-doctorates.csv", show_col_types = FALSE)
dc <- dc %>%
mutate(
publication_date = as.Date(publishedate),
pub_year = as.integer(format(publication_date, "%Y")),
pub_month = as.integer(format(publication_date, "%m")),
pub_day = as.integer(format(publication_date, "%d")),
)
pub_year_stats <- quick_numeric_stats(dc, pub_year) %>%
mutate(missing_pct = round(100 * missing / total, 2))
print(pub_year_stats)
## # A tibble: 1 × 11
## variable total missing min q1 median mean q3 max sd
## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <int> <dbl>
## 1 pub_year 3900167 0 1950 1983 2000 1996. 2012 2023 17.9
## missing_pct
## <dbl>
## 1 0
# data validity test: All data are good
dc <- dc %>%
mutate(
invalid_month = !is.na(pub_month) & !(pub_month %in% 1:12),
invalid_day = !is.na(pub_day) & !(pub_day %in% 1:31),
parse_fail = is.na(publication_date)
)
date_validity <- dc %>%
summarise(
n_total = n(),
parse_fail_n = sum(parse_fail),
invalid_month_n = sum(invalid_month),
invalid_day_n = sum(invalid_day)
)
print(date_validity)
## # A tibble: 1 × 4
## n_total parse_fail_n invalid_month_n invalid_day_n
## <int> <int> <int> <int>
## 1 3900167 0 0 0
# year distribution
ggplot(dc %>% filter(!is.na(pub_year)), aes(x = pub_year)) +
geom_histogram(binwidth = 1) +
labs(title = "Dissertations by Publication Year",
x = "Publication Year", y = "Count")

Degree year analysis
# Doctorate degree years should be four-digit numeric values (YYYY).
doctorates <- doctorates %>%
mutate(degreeyear = suppressWarnings(as.integer(degreeyear)))
# total 30 rows that are not 4 digits
invalid_rows <- doctorates %>%
filter(is.na(degreeyear) | !str_detect(as.character(degreeyear), "^\\d{4}$"))
write_csv(invalid_rows, "degreeyear_invalid_rows.csv")
# valid 4 digits
doctorates_valid4 <- doctorates %>%
filter(!is.na(degreeyear) & str_detect(as.character(degreeyear), "^\\d{4}$"))
# 1600-2025 distribution
print( quick_numeric_stats(doctorates_valid4, degreeyear) )
## # A tibble: 1 × 10
## variable total missing min q1 median mean q3 max sd
## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <int> <dbl>
## 1 degreeyear 3900137 0 1605 1983 2000 1996. 2012 2023 17.9
ggplot(doctorates_valid4, aes(degreeyear)) +
geom_histogram(binwidth = 1, boundary = 0, closed = "left",
fill = "blue", colour = "grey") +
scale_x_continuous(breaks = seq(1600, 2025, 25), limits = c(1600, 2025)) +
labs(title = "Doctorate Degree Years (All 4-digit)", x = "Degree Year", y = "Count") +
theme_minimal()

# <1950 distribution
doctorates_valid4 %>%
filter(degreeyear < 1950) %>%
summarise(n = n(), min_year = min(degreeyear), max_year = max(degreeyear)) %>%
print()
## # A tibble: 1 × 3
## n min_year max_year
## <int> <int> <int>
## 1 52 1605 1949
# filter 1950–2023
doctorates_clean <- doctorates_valid4 %>%
filter(degreeyear >= 1950, degreeyear <= 2023)
print( quick_numeric_stats(doctorates_clean, degreeyear) )
## # A tibble: 1 × 10
## variable total missing min q1 median mean q3 max sd
## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <int> <dbl>
## 1 degreeyear 3900085 0 1950 1983 2000 1996. 2012 2023 17.9
ggplot(doctorates_clean, aes(degreeyear)) +
geom_histogram(binwidth = 5, boundary = 0, closed = "left",
fill = "blue", colour = "grey") +
scale_x_continuous(breaks = seq(1950, 2025, 5), limits = c(1950, 2025)) +
labs(title = "Doctorate Degree Years (1950–2023, 5-year bins)",
x = "Degree Year", y = "Count") +
theme_minimal()

write_csv(doctorates_clean, "doctorates_1950_2023.csv")
Degree vs Publish Year Analysis
doctorates_clean <- readr::read_csv("dissertations-doctorates.csv", show_col_types = FALSE)
year_min <- 1950; year_max <- 2023
dc <- doctorates_clean %>%
mutate(
pub_year = year(publishedate),
degreeyear_int = suppressWarnings(as.integer(degreeyear))
)
# coverage table
coverage <- dc %>%
dplyr::summarise(
n_total = dplyr::n(),
n_deg_valid = sum(stringr::str_detect(as.character(degreeyear), "^\\d{4}$") &
as.integer(degreeyear) >= year_min & as.integer(degreeyear) <= year_max, na.rm = TRUE),
n_pub_valid = sum(!is.na(pub_year), na.rm = TRUE),
n_both = sum(!is.na(pub_year) &
stringr::str_detect(as.character(degreeyear), "^\\d{4}$") &
as.integer(degreeyear) >= year_min & as.integer(degreeyear) <= year_max, na.rm = TRUE)
) %>%
dplyr::mutate(
pct_deg_valid = n_deg_valid / n_total,
pct_pub_valid = n_pub_valid / n_total,
pct_both = n_both / n_total
)
print(coverage)
## # A tibble: 1 × 7
## n_total n_deg_valid n_pub_valid n_both pct_deg_valid pct_pub_valid pct_both
## <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 3900167 3900085 3900167 3900085 1.00 1 1.00
readr::write_csv(coverage, "coverage_degree_vs_publish.csv")
# compute lag = publish year − degree year, usually publish year >= degree year
lag_df <- dc %>%
dplyr::mutate(degreeyear = suppressWarnings(as.integer(degreeyear))) %>%
dplyr::filter(
!is.na(pub_year),
!is.na(degreeyear),
dplyr::between(degreeyear, year_min, year_max)
) %>%
dplyr::mutate(year_diff = pub_year - degreeyear)
cat("Lag sample size (both present, degree ", year_min, "–", year_max, "): ",
nrow(lag_df), "\n", sep = "")
## Lag sample size (both present, degree 1950–2023): 3900085
# basic stats
lag_num <- quick_numeric_stats(lag_df, year_diff)
lag_counts <- lag_df %>%
dplyr::summarise(
n_valid = dplyr::n(),
n_same_year = sum(year_diff == 0),
n_before = sum(year_diff < 0),
n_after = sum(year_diff > 0)
) %>%
dplyr::mutate(
share_same = n_same_year / n_valid,
share_before= n_before / n_valid,
share_after = n_after / n_valid
)
lag_summary <- dplyr::bind_cols(lag_num, lag_counts)
print(lag_summary)
## # A tibble: 1 × 17
## variable total missing min q1 median mean q3 max sd n_valid
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 year_diff 3900085 0 -51 0 0 0.0150 0 61 0.626 3900085
## n_same_year n_before n_after share_same share_before share_after
## <int> <int> <int> <dbl> <dbl> <dbl>
## 1 3887833 1842 10410 0.997 0.000472 0.00267
readr::write_csv(lag_summary, "lag_summary_and_shares.csv")
# histogram of lag
zoom_xlim <- c(-3, 5)
ggplot(lag_df, aes(x = year_diff)) +
geom_histogram(binwidth = 1, boundary = 0, closed = "left",
aes(y = after_stat(..count.. / sum(..count..)))) +
coord_cartesian(xlim = zoom_xlim) +
labs(title = paste0("Lag = publish year − degree year (zoom ", zoom_xlim[1], "…", zoom_xlim[2], ")"),
x = "Years", y = "Share of dissertations") +
theme_minimal()

# outliers (publish >5y after OR >1y before degree)
neg_outlier <- -1; pos_outlier <- 5
outliers <- lag_df %>%
dplyr::filter(year_diff < neg_outlier | year_diff > pos_outlier) %>%
dplyr::arrange(dplyr::desc(abs(year_diff)))
readr::write_csv(outliers, "outliers_lag.csv")
cat("Outliers flagged: ", nrow(outliers),
" (criteria: <", neg_outlier, " or >", pos_outlier, ")\n", sep = "")
## Outliers flagged: 2488 (criteria: <-1 or >5)
Make crosswalk for degrees
Need adjust for new degree
deg_unique <- doctorates_clean %>%
distinct(degree) %>%
pull()
cat("Unique degree variants:", length(deg_unique), "\n")
## Unique degree variants: 389
#doctorates_clean <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)
#
# gemini_map <- function(deg_vec) {
#
# # prompt
# prompt <- paste(
# "You are a data-cleaning assistant.",
# "OUTPUT must be ONLY a JSON array, no preface, no markdown fences.",
# "Each object needs keys 'original' and 'standard'.",
# "- 'standard' = full formal degree name (e.g. 'Doctor of Philosophy').",
# "- Do NOT return abbreviations like PhD, EdD, DBA, etc.",
# "- If the original is already formal, copy it to 'standard'.",
# "- Keep the list order.",
# "\n\n",
# paste0(seq_along(deg_vec), ". ", deg_vec, collapse = "\n")
# )
#
# body <- list(
# contents = list(list(parts = list(list(text = prompt))))
# )
#
# resp <- POST(
# gemini_endpoint,
# body = toJSON(body, auto_unbox = TRUE),
# encode = "json",
# add_headers(
# `Content-Type` = "application/json",
# `X-goog-api-key` = Sys.getenv("GEMINI_API_KEY")
# )
# )
# stop_for_status(resp)
#
# txt <- fromJSON(content(resp, "text", encoding = "UTF-8"),
# simplifyVector = FALSE)$candidates[[1]]$content$parts[[1]]$text
#
# json_str <- txt |>
# str_extract("(?s)\\[.*\\]") |>
# str_trim()
#
# as_tibble(fromJSON(json_str))
# }
# chunks <- split(deg_unique, ceiling(seq_along(deg_unique) / 50))
# crosswalk <- map_dfr(chunks, \(v) { out <- gemini_map(v); Sys.sleep(1); out })
#write_csv(crosswalk, "degree_crosswalk_api_raw.csv")
#print(crosswalk, n = 20)
Top10 Degree
crosswalk <- readr::read_csv(
"degree_crosswalk_api_marked.csv",
show_col_types = FALSE
)
doctorates_std <- doctorates_clean |>
dplyr::left_join(crosswalk, by = c("degree" = "original")) |>
dplyr::mutate(
standard_degree = dplyr::coalesce(standard, degree)
)
# top 10 degree
top10_degree <- doctorates_std |>
dplyr::count(standard_degree, sort = TRUE, name = "n") |>
dplyr::slice_head(n = 10)
total_degrees <- dplyr::n_distinct(doctorates_std$standard_degree)
cat("Total unique degrees (standardized):", total_degrees, "\n\n")
## Total unique degrees (standardized): 313
print(top10_degree)
## # A tibble: 10 × 2
## standard_degree n
## <chr> <int>
## 1 Doctor of Philosophy 3338159
## 2 Doctor of Education 222885
## 3 Doctorate/Docteur 128825
## 4 Doctor of Psychology 30118
## 5 Doctor of Medicine/Medical Doctor 27044
## 6 Doctor of Musical Arts 16629
## 7 Doctor of Business Administration 15707
## 8 Doctor of Ministry 15219
## 9 Doctor of Engineering 11751
## 10 Doctor of Clinical Psychology 8115
degree_counts <- doctorates_std %>%
dplyr::count(standard_degree, name = "n") %>%
dplyr::arrange(desc(n))
write_csv(degree_counts, "all_standard_degrees_with_counts.csv")
# top 10 universities (total count)
top10_by_total <- doctorates_clean %>%
count(university, name = "total_papers") %>%
slice_max(total_papers, n = 10)
print(top10_by_total)
## # A tibble: 10 × 2
## university total_papers
## <chr> <int>
## 1 University of California, Berkeley 48513
## 2 The University of Wisconsin - Madison 48180
## 3 University of Oxford (United Kingdom) 44764
## 4 The University of Manchester (United Kingdom) 44388
## 5 University of Illinois at Urbana-Champaign 41632
## 6 University of Michigan 40521
## 7 The Ohio State University 39845
## 8 Columbia University 38748
## 9 University of Minnesota 38575
## 10 Harvard University 38574
# top 10 universities (average paper per year)
univ_avg_per_year <- doctorates_clean %>%
group_by(university) %>%
summarise(
total_papers = n(),
active_years = n_distinct(degreeyear),
papers_per_year = total_papers / active_years
) %>%
ungroup() %>%
arrange(desc(papers_per_year))
# total school count
total_schools <- n_distinct(doctorates_clean$university)
# top 10
top10_by_avg <- univ_avg_per_year %>%
slice_max(papers_per_year, n = 10)
print(top10_by_avg)
## # A tibble: 10 × 4
## university total_papers active_years
## <chr> <int> <int>
## 1 University of California, Berkeley 48513 73
## 2 The University of Wisconsin - Madison 48180 74
## 3 University of Oxford (United Kingdom) 44764 73
## 4 The University of Manchester (United Kingdom) 44388 73
## 5 University of Illinois at Urbana-Champaign 41632 71
## 6 Capella University 14577 26
## 7 University of Michigan 40521 73
## 8 The Ohio State University 39845 73
## 9 Harvard University 38574 73
## 10 Columbia University 38748 74
## papers_per_year
## <dbl>
## 1 665.
## 2 651.
## 3 613.
## 4 608.
## 5 586.
## 6 561.
## 7 555.
## 8 546.
## 9 528.
## 10 524.
cat("Total universities:", total_schools, "\n\n")
## Total universities: 2985
Univeristy Location & World Graph
doctorates_clean %>%
distinct(university) %>%
arrange(university) %>%
write_csv("unique_universities.csv")
# Google Geocoding API
# Sys.setenv(GOOGLEGEOCODE_API_KEY = "AIzaSyCiqn3cISXHi85YxG5ycdSMD6g9tctlpQE")
# uni_raw <- read_csv("unique_universities.csv", col_names = "university")
#
# geo_out <- uni_raw %>%
# geocode(
# address = university,
# method = "google",
# lat = latitude,
# long = longitude,
# full_results = TRUE,
# limit = 1
# )
#
# uni_coords_only <- geo_out %>%
# mutate(
# lat = round(latitude, 6),
# lon = round(longitude, 6)
# ) %>%
# transmute(
# university = university,
# lon,
# lat,
# address = formatted_address
# )
#
# print(head(uni_coords_only, 20))
# write_csv(uni_coords_only, "universities_with_coords_and_address.csv")
# missing
uni_coords <- read_csv("universities_with_coords_and_address.csv")
## Rows: 2985 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): university, address
## dbl (2): lon, lat
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
missing_summary <- uni_coords %>%
summarise(
total = n(),
missing_lon = sum(is.na(lon)),
missing_lat = sum(is.na(lat)),
missing_both = sum(is.na(lon) & is.na(lat)),
pct_missing_lon = missing_lon / total,
pct_missing_lat = missing_lat / total,
pct_missing_both = missing_both / total
)
print(missing_summary)
## # A tibble: 1 × 7
## total missing_lon missing_lat missing_both pct_missing_lon pct_missing_lat
## <int> <int> <int> <int> <dbl> <dbl>
## 1 2985 74 74 74 0.0248 0.0248
## pct_missing_both
## <dbl>
## 1 0.0248
# hand-tagged dataset, deleted "None"
unis <- read_csv("universities_with_coords_marked.csv")
## Rows: 2983 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): university, address
## dbl (2): lon, lat
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# check missing values
missing_summary2 <- unis %>%
summarise(
total_rows = n(),
missing_lon = sum(is.na(lon)),
missing_lat = sum(is.na(lat)),
missing_address = sum(is.na(address))
)
print(missing_summary2)
## # A tibble: 1 × 4
## total_rows missing_lon missing_lat missing_address
## <int> <int> <int> <int>
## 1 2983 0 0 0
Graph: number of universities and number of thesis
# summarize the number of papers by university
thesis_by_uni <- doctorates_clean %>%
count(university, name = "thesis_n")
unis_counts <- unis %>%
left_join(thesis_by_uni, by = "university") %>%
mutate(thesis_n = tidyr::replace_na(thesis_n, 0))
# find country using lon and lat
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf") %>%
dplyr::select(country = admin)
unis_sf <- st_as_sf(
unis_counts,
coords = c("lon", "lat"),
crs = 4326, remove = FALSE
)
unis_country <- st_join(unis_sf, world, join = st_within) %>%
st_drop_geometry()
missing_init <- which(is.na(unis_country$country))
cat("Initial unmatched universities:", length(missing_init), "\n")
## Initial unmatched universities: 85
# manual check
if (length(missing_init) > 0) {
write_csv(unis_country[missing_init, ], "unmatched_unis.csv")
}
manual_fix <- readr::read_csv("unmatched_unis_marked.csv", show_col_types = FALSE) %>%
dplyr::select(university, country)
unis_country <- unis_country %>%
dplyr::left_join(manual_fix, by = "university", suffix = c("", "_man")) %>%
dplyr::mutate(country = dplyr::coalesce(country, country_man)) %>%
dplyr::select(-country_man)
missing_left <- sum(is.na(unis_country$country))
cat("Still Missing:", missing_left, "\n")
## Still Missing: 0
country_sum <- unis_country %>%
group_by(country) %>%
summarise(
lon = mean(lon, na.rm = TRUE),
lat = mean(lat, na.rm = TRUE),
univ_n = n_distinct(university),
thesis_total = sum(thesis_n),
.groups = "drop"
) %>%
filter(!is.na(country))
bins_cty <- c(0, 500, 1000, 5000, 10000, 30000, Inf)
pal_cty <- colorBin("Blues", domain = country_sum$thesis_total, bins = bins_cty)
leaflet(country_sum) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(
~lon, ~lat,
radius = ~pmax(3, sqrt(univ_n)),
fillColor = ~pal_cty(thesis_total),
fillOpacity = 0.85, stroke = FALSE,
popup = ~paste0(
"<b>", country, "</b><br/>",
"Universities: ", univ_n, "<br/>",
"Doctoral theses: ", thesis_total
)
) %>%
addLegend(
"bottomright", pal = pal_cty, values = ~thesis_total,
title = "Doctoral theses (country)", opacity = 1
)
Graph: number of thesis per universities
bins_uni <- c(0, 500, 1000, 5000, 10000, 30000, Inf)
pal_uni <- colorBin("YlOrRd", domain = unis_counts$thesis_n, bins = bins_uni)
fixed_radius <- 6
leaflet(unis_counts) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(
lng = ~lon,
lat = ~lat,
radius = fixed_radius,
fillColor = ~pal_uni(thesis_n),
fillOpacity = 0.85, stroke = FALSE,
popup = ~paste0(
"<b>", university, "</b><br/>",
"Doctoral theses: ", thesis_n
)
) %>%
addLegend(
"bottomright",
pal = pal_uni,
values = ~thesis_n,
title = "Doctoral theses per university",
opacity = 1
)
clean fields
fields_raw <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)
clean_terms <- function(x) {
x %>%
str_replace_all("\\[|\\]", "") %>%
str_squish() %>%
str_to_sentence(locale = "en")
}
fields <- fields_raw %>%
mutate(across(any_of(c("classterms", "subjecterms", "papercat")), clean_terms)) %>%
mutate(across(where(is.character), ~ na_if(., "")))
uniq_class <- fields %>%
separate_rows(classterms, sep = ";") %>%
mutate(term = str_trim(classterms)) %>%
filter(!is.na(term) & term != "") %>%
distinct(term)
uniq_subject <- fields %>%
separate_rows(subjecterms, sep = ";") %>%
mutate(term = str_trim(subjecterms)) %>%
filter(!is.na(term) & term != "") %>%
distinct(term)
uniq_papercat <- fields %>%
separate_rows(papercat, sep = ";") %>%
mutate(term = str_trim(papercat)) %>%
filter(!is.na(term) & term != "") %>%
distinct(term)
# export csv
write_csv(uniq_class, "unique_classterms.csv")
write_csv(uniq_subject, "unique_subjecterms.csv")
write_csv(uniq_papercat,"unique_papercat.csv")
Stats for fields
# count unique term for each column
term_counts <- tibble(
field = c("classterms", "subjecterms", "papercat"),
unique_terms = c(
nrow(uniq_class),
nrow(uniq_subject),
nrow(uniq_papercat)
)
)
print(term_counts)
## # A tibble: 3 × 2
## field unique_terms
## <chr> <int>
## 1 classterms 1672
## 2 subjecterms 21503
## 3 papercat 40
total_rows <- nrow(fields)
cat("\nTotal rows in dataset:", total_rows, "\n")
##
## Total rows in dataset: 3900085
# missing count for single column
single_missing <- fields %>%
summarise(
miss_classterms = sum(is.na(classterms)),
miss_subjecterms = sum(is.na(subjecterms)),
miss_papercat = sum(is.na(papercat))
) %>%
pivot_longer(everything(),
names_to = "missing_type",
values_to = "n") %>%
mutate(pct = n / total_rows)
print(single_missing)
## # A tibble: 3 × 3
## missing_type n pct
## <chr> <int> <dbl>
## 1 miss_classterms 430560 0.110
## 2 miss_subjecterms 349546 0.0896
## 3 miss_papercat 463337 0.119
# missing count & percentage (mutual exclusion)
missing_tbl <- fields %>%
mutate(
miss_class = is.na(classterms),
miss_subject = is.na(subjecterms),
miss_paper = is.na(papercat),
missing_type = case_when(
miss_class & !miss_subject & !miss_paper ~ "Only classterms",
!miss_class & miss_subject & !miss_paper ~ "Only subjecterms",
!miss_class & !miss_subject & miss_paper ~ "Only papercat",
miss_class & miss_subject & !miss_paper ~ "Classterms + subjecterms",
miss_class & !miss_subject & miss_paper ~ "Classterms + papercat",
!miss_class & miss_subject & miss_paper ~ "Subjecterms + papercat",
miss_class & miss_subject & miss_paper ~ "All three",
TRUE ~ "None missing"
)
) %>%
filter(missing_type != "None missing") %>%
count(missing_type, name = "n") %>%
mutate(pct = n / total_rows) %>%
arrange(desc(n))
print(missing_tbl)
## # A tibble: 7 × 3
## missing_type n pct
## <chr> <int> <dbl>
## 1 Only papercat 370870 0.0951
## 2 Classterms + subjecterms 302705 0.0776
## 3 Classterms + papercat 46114 0.0118
## 4 All three 45820 0.0117
## 5 Only classterms 35921 0.00921
## 6 Subjecterms + papercat 533 0.000137
## 7 Only subjecterms 488 0.000125
Mapped for classterms and subjecterms
# fields <- read_csv("doctorates_1950_2023.csv", show_col_types = FALSE)
#
# map_class_subject <- fields %>%
# separate_rows(classterms, sep = ";") %>%
# separate_rows(subjecterms, sep = ";") %>%
# mutate(
# classterms = str_trim(classterms),
# subjecterms = str_trim(subjecterms)
# ) %>%
# distinct(classterms, subjecterms) %>%
# group_by(classterms) %>%
# summarise(
# n_subjects = n_distinct(subjecterms),
# subject_list = paste(unique(subjecterms), collapse = "; "),
# .groups = "drop"
# )
#
#
# write_csv(map_class_subject, "map_classterms_subjecterms.csv")
#
# # basic stats
# quick_numeric_stats(map_class_subject, n_subjects)
#
# # top 20 print
# top20_classes <- map_class_subject %>%
# arrange(desc(n_subjects)) %>%
# slice_head(n = 20)
#
# print(top20_classes)
#
# # histogram
# ggplot(map_class_subject, aes(x = n_subjects)) +
# geom_histogram(binwidth = 5, fill = "steelblue", color = "steelblue") +
# labs(
# title = "Distribution of Number of Subjects per Class Term",
# x = "Number of Subjects",
# y = "Count of Class Terms"
# )
#
# # top20
# ggplot(top20_classes, aes(x = reorder(classterms, n_subjects), y = n_subjects)) +
# geom_col(fill = "steelblue") +
# coord_flip() +
# labs(
# title = "Top 20 Class Terms by Number of Subject Terms",
# x = "Class Term",
# y = "Number of Subjects"
# )