Setup

Load data from proquest on 2020-2024-02-01 sociology department dissertations in English. Clean and tokenise as 2-skip-1 ngrams

Code
if (!file.exists("data/coded_abstracts.xlsx") || 
    !file.exists("data/ngram_seeds.xlsx") || TRUE
    ) {
requireNamespace("googledrive")
folderid <- googledrive::as_id("https://drive.google.com/drive/folders/1eFat4mCoRZX22gPLTj9w5Bn63U3q71yo")

filels <- googledrive::drive_ls(folderid)

googledrive::drive_download(
  file = filels %>% filter(name=="abstracts_hand_coding") %>% pull(`id`),
  path = "data/coded_abstracts",
  type= "xlsx",
  overwrite=TRUE)

googledrive::drive_download(
  file = filels %>% filter(name=="socabs_topic_seeds") %>% pull(`id`),
  path = "data/ngram_seeds",
  type= "xlsx",
  overwrite=TRUE)

googledrive::drive_download(
  file = filels %>% filter(name=="stems_and_stops") %>% pull(`id`),
  path = "data/stems_and_stops",
  type= "xlsx",
  overwrite=TRUE)

 rm(folderid, filels)
}
Loading required namespace: googledrive
! Using an auto-discovered, cached token.
  To suppress this message, modify your code or options to clearly consent to
  the use of a cached token.
  See gargle's "Non-interactive auth" vignette for more details:
  <https://gargle.r-lib.org/articles/non-interactive-auth.html>
ℹ The googledrive package is using a cached token for 'micah.altman@gmail.com'.
Auto-refreshing stale OAuth token.
File downloaded:
• 'abstracts_hand_coding' <id: 1FO5AjIpe0iIgLgTaHBK8b7OLUR29JY9Z1aGxOWZkJ9U>
Saved locally as:
• 'data/coded_abstracts.xlsx'
File downloaded:
• 'socabs_topic_seeds' <id: 1l9I2kvMUFkVpmcMfm2pfcXCyU18fX4En5ptfu9qu8zo>
Saved locally as:
• 'data/ngram_seeds.xlsx'
File downloaded:
• 'stems_and_stops' <id: 1sp16X7rED97CXjHFloUOOg10A2mb3VaH3Dyfxjp-dv0>
Saved locally as:
• 'data/stems_and_stops.xlsx'
Code
coded_abs.df <- readxl::read_excel("data/coded_abstracts.xlsx")
Code
# retrieve and merge abstracts data
rawdir <- "data/raw"
mergedfile <- "data/diss_meta.RDS"

retrieve_raw_data <- function(dest) {
  requireNamespace("googledrive")
  folderid <- 'https://drive.google.com/drive/folders/1BFMQkwj24zwH3uTIFmrCnp32dkfY807E'
  
  filels <- googledrive::drive_ls(folderid)
 
  purrr::pwalk(filels, function(name,id,...) {
    googledrive::drive_download(
       file = id,
       path = file.path(dest,name),
      overwrite=TRUE)
    })
}

merge_raw_data<-function(dest) { 
   xls.ls <- dir(destdir,pattern="\\.xls", full.names = TRUE)
   ris.ls <- dir(destdir,pattern="\\.ris", full.names = TRUE)
   
   merged_xls.df <- purrr::map(xls.ls, readxl::read_excel) %>% 
     purrr::list_rbind()
   
   
   merged_ris.df <- purrr::map(ris.ls, revtools::read_bibliography) %>% 
     purrr::list_rbind()
   
   merged.df <- merged_xls.df %>% 
     left_join( merged_ris.df %>% select(advisor=author,StoreId=accession,
                      organization=publisher, place = "pubplace"),
                by = "StoreId" )
   
   merged.df
}

if (!file.exists(mergedfile)) {
  if (length(dir(destdir, pattern = "\\.ris$"))==0) {
      retrieve_raw_data(destdir)
  }
  diss_meta_full.df <- merge_raw_data(destdir)  
  saveRDS(diss_meta_full.df, file=mergedfile)
} 

diss_meta_full.df <- readRDS(mergedfile)
Code
abs1.df <- readxl::read_excel("data/ProQuestDocuments-2024-02-01.xls")
abs2.df <- readxl::read_excel("data/ProQuestDocuments-2024-02-01.2.xls")
diss_meta.df <- bind_rows(abs1.df, abs2.df)
rm("abs1.df","abs2.df")

### text cleaning

clean_text<-function(x) {
  x %>%  
    stringr::str_replace_all("U\\.S\\.","USA") %>% 
    stringr::str_to_lower() %>% 
    stringr::str_replace_all("[-\\)\\(\\&\"/]"," ") %>%
    stringr::str_replace_all("[^a-zA-Z \\.']", "")  %>% 
    stringr::str_squish()
}


# NOTE:
# Columns in source overal with each other -- only selected columns used
#  - all duplicates: classification, subjectClassifications, classiifcationCOdes, majorClassificationsCodes
# - subjectTerms appears to be an automated coarse recoding of the classification
# - appears author assigned, duplicate columns: identifierKeywords, subjects
#   apparently post-processed to add "GenderWatch" and "y" tags 

diss_cleaned.df <- diss_meta.df %>% 
  select(isbn, Authors, classification, subjectTerms, pubdate,
         Abstract, Title, identifierKeywords) %>%
  mutate(classification = replace_na(classification,""),
         subjectTerms = replace_na(subjectTerms,""),
         identifierKeywords = replace_na(identifierKeywords,""),
         pubyear_clean = year(as_date(pubdate,format="%Y"))
         ) %>%
  rowwise() %>%
  mutate(classification_clean = 
          str_split_1(classification ,pattern = ",") %>% 
          str_squish() %>%
          str_replace("^[0-9]+ ","") %>%
          clean_text()  %>%
          unique() %>%
          list() ,
         subject_terms_clean = str_split_1(subjectTerms,pattern = ",") %>%
           clean_text() %>% 
           unique() %>%
           list(), 
         au_identifier_terms_clean =
           str_split_1(identifierKeywords,pattern = ",") %>%
           clean_text() %>% 
           unique() %>%
           list(),
        abstract_clean = 
          clean_text(Abstract),
        title_clean =
          clean_text(Title)
        
  ) %>% 
    ungroup() %>%
    select(isbn, Authors, classification_clean, subject_terms_clean, pubyear_clean,
           Abstract,Title,identifierKeywords, au_identifier_terms_clean, title_clean, abstract_clean)

diss_cleaned.df %>%
  count(pubyear_clean) %>%
  rename(year=pubyear_clean)-> dy.df
dy.ls <- dy.df %>% pull(n)
names(dy.ls) <- dy.df %>% pull(year)

rm(diss_meta.df)

Author Characteristics

Code
requireNamespace("opengender")
Loading required namespace: opengender
Code
diss_cleaned_plus_gender.df <- diss_cleaned.df %>% 
  mutate(given=  str_split_i(Authors,pattern=",",2)
         %>% str_squish() %>% 
           str_split_i(pattern="[:space:]",1)) 

diss_cleaned_plus_gender.df %<>% opengender::add_gender_predictions(dict= "wgen2")

diss_cleaned_plus_gender.df %>% 
  summarize(opengender::gender_mean(og_details,simplify_output="row"))
# A tibble: 1 × 3
  prop_F prop_M   prop_O
   <dbl>  <dbl>    <dbl>
1  0.573  0.427 8.76e-19

Text prep

Code
# subset of the snowball stopwords

minimal_stopwords <-
  c("a",  "am", "an", "and", "any", "are", "aren't", "as", "at", "be","but", "by",  "did", "didn't", 
"do", "does", "doesn't", "doing", "don't", "down", "during", 
"each", "for", "from", "further", "had", "hadn't", "has", 
"hasn't", "have", "haven't", "having", "how", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "my", "myself",  "of", "on",  "or", "other",  "so", "than", "that", "that's", "the","their", "they", "them", "then", "there", "there's", "these", "this", "how", "to", "too",  "was", "wasn't", "when", "when's", "where", "where's", "which", "while", "will", "with", "won't")

combined_tidy.df <-  diss_cleaned.df %>%
  select(isbn,abstract_clean,title_clean,au_identifier_terms_clean) %>%
  unite(col="Clean_combined", sep=" ", remove=TRUE, na.rm=TRUE,
        abstract_clean,title_clean,au_identifier_terms_clean) %>%
  unnest_tokens(ngram, "Clean_combined", token = "skip_ngrams", n = 2,k=1,
                stopwords=minimal_stopwords)  %>%
  mutate(ngram = str_squish(ngram)) %>% 
  filter(str_length(ngram)>1)

#%>%
#  filter(!str_detect(ngram,"\\.")) %>%
#  filter(ngram!="")
Code
if (FALSE) {
  combined_tidy.df %<>%
     mutate(ngram = textstem::stem_strings(ngram, language="english"))
}

stop_post.df <- readxl::read_excel("./data/stems_and_stops.xlsx",
                                       sheet = "stopwords")

stem_raw.df <- readxl::read_excel("./data/stems_and_stops.xlsx",
                                       sheet = "groupwords")

stem_post.df <- stem_raw.df %>%
  dplyr::rowwise() %>%
  mutate( root = stringr::str_split_1(`group`, pattern="; ")[[1]],
          stem = stringr::str_split(`group`, pattern="; ")) %>%
  dplyr::ungroup() %>%
  tidyr::unnest(stem) %>%
  dplyr::filter(root!=stem) %>% 
  select(!group)
  
stem_string <- function(x,delim=" ") {
  longword.df <- stringr::str_split(x,delim)[[1]] %>%
    as_tibble()
  
  longword.df %<>% left_join(stem_post.df, by=c(value="stem"))
  longword.df %<>% mutate(result=coalesce(root,value))
  longword.df %>% pull(result) %>% paste(collapse=delim)
}

stem_reg<-function(x, pattern, delim="; ") {
  pattern_vec=str_split(pattern,delim)[[1]]
  replace_target = paste0(" ",pattern_vec[1]," ")
  replace_pattern = paste0("((^| )",pattern_vec[2:length(pattern_vec)],"($| ))",collapse="|")
  str_replace_all(x,pattern=replace_pattern,replacement=replace_target) %>%
  str_squish()
}

# 10000x faster than join version
stem_string_reg <- function(x) {
  
  reglist <- stem_raw.df %>% pull(group)
  cum <-x
  for (g in reglist) {
     cum <-  stem_reg(cum,g)
  }
  cum
}

#stem_string_reg(c("racial racialize race baiting","foo race", "bar gendered"))

stem_string_V<- Vectorize(stem_string)

combined_tidy_stem.df <- combined_tidy.df %>%
  mutate(ngram=stem_string_reg(ngram)) %>%
  ungroup()
Code
combined_tf_idf_stemmed.df <- 
  combined_tidy.df %>% 
  count(ngram,isbn) %>%
    bind_tf_idf(ngram, isbn, n)

combined_tf_idf_stemmed.df  %<>%
  anti_join(stop_post.df, by="ngram")

Distributions of topics (unclustered)

Dissertations by Controlled Classifications

Code
diss_cleaned.df %>% 
  rename(term=classification_clean) %>%
  select(isbn, term, pubyear_clean) %>% 
  unnest(cols=c(term)) -> diss_class_tidy.df

ndis <- sum(dy.ls)
lower_q <- .05
upper_q <- .8

{diss_class_tidy.df %>% 
  count(term, sort=TRUE) %>%
  mutate(p = n/sum(dy.ls)) %>%
  filter(p>= lower_q ,
         p <= upper_q ) %>%
  ggplot(aes(x=fct_reorder(term,p),y=p))+
  geom_col() +
  coord_flip() +
  labs(x="%age of dissertation assigned to controlled classifications")} %>% plotly::ggplotly()
Code
{diss_class_tidy.df %>% 
  filter(pubyear_clean < 2024) %>%
  count(term, pubyear_clean, sort=TRUE) %>%
  rowwise() %>%
  mutate(p=n/ndis,
         p_year=n/dy.ls[[as.character(pubyear_clean)]]) %>%
  filter(p_year >=  lower_q ,
         p_year <=  upper_q ) %>%
  ggplot(aes(x=fct_reorder(term,p),y=p_year))+
  geom_col() +
  coord_flip() +
  labs(x="%age dissertations in assigned  classification over time") +
  facet_wrap(vars(pubyear_clean))} %>% plotly::ggplotly() 
Code
rm(diss_class_tidy.df, lower_q, upper_q)

Dissertations by Author-Assigned Topics

Code
stop_topics.df <- tibble(term=c("y","genderwatch"))

diss_cleaned.df %>% 
  rename(term=au_identifier_terms_clean) %>%
  select(isbn, term, pubyear_clean) %>% 
  unnest(cols=c(term))  %>%
  anti_join(stop_topics.df,by="term") %>%
  distinct() -> diss_au_id_tidy.df

lower_q <- .025
upper_q <- 1

{diss_au_id_tidy.df %>% 
  count(term, sort=TRUE) %>%
  mutate(p = n/sum(dy.ls)) %>%
  filter(p>= lower_q ,
         p <= upper_q ) %>%
  ggplot(aes(x=fct_reorder(term,p),y=p))+
  geom_col() +
  coord_flip() +
  labs(x="%age of dissertation by author-assigned keywords")} %>% plotly::ggplotly()
Code
rm(diss_au_id_tidy.df, stop_topics.df)

Dissertations by Terminology

Code
unc_doc_freq.df <-
  combined_tf_idf_stemmed.df %>% 
  count(ngram) %>%
  mutate(p=n/sum(dy.ls))

lower_q <- .25 
upper_q <- .75

unc_doc_freq.df  %>% 
      slice_max(order_by=n, n=200) %>%
      rename(freq=n,word=ngram) %>%
      ggwordcloud::ggwordcloud2(size=.8) +
      labs(title="terms appearing in most dissertations - excluding stopwords") 
Warning in wordcloud_boxes(data_points = points_valid_first, boxes = boxes, :
Some words could not fit on page. They have been removed.

Code
{unc_doc_freq.df %>% 
  mutate(p = n/sum(dy.ls),
         ngram=fct_reorder(ngram,p)) %>%
  filter(p>= lower_q,
         p<= upper_q) %>%
  ggplot(aes(x=ngram,y=p))+
  geom_col() +
  coord_flip() +
  labs(x="terms appearing in [25%-75%] of dissertations (excluding stopwords, min 1%)")} %>% plotly::ggplotly()
Code
unc_doc_freq.df  %>% 
    slice_max(order_by=n, n=1000) %>%
    DT::datatable (
      data = .,
      extensions = 'Buttons',
      options = list(dom = 'Bfrltip',
                     buttons = c('csv')),
      caption ="ngrams appearing in most dissertations"
    )
Code
rm(unc_doc_freq.df, lower_q, upper_q)

Distribution of Terminology Across Corpus

Code
unc_ngram_freq.df <- 
  combined_tf_idf_stemmed.df %>% 
  group_by(ngram) %>%
  summarise(n=sum(n), .groups="drop") %>%
  arrange(n)

unc_ngram_freq.df  %>% 
      slice_max(order_by = n,n=200) %>%
      rename(freq=n,word=ngram) %>%
      ggwordcloud::ggwordcloud2(size=.8) +
      labs(title="most popular uncontrolled terms - excluding stop words") 

Code
unc_ngram_freq.df  %>% 
    slice_max(order_by = n,n=1000) %>%
    DT::datatable (
      data = .,
      extensions = 'Buttons',
      options = list(dom = 'Bfrltip',
                     buttons = c('csv')),
      caption ="Most frequent 1 and 2 word terms in uncontrolled description"
    )
Code
rm(unc_ngram_freq.df)

Most Distinctive Terms in Each Dissertation

Code
nterms <- 5
ndiss <- 100

topterms.df <-
  combined_tf_idf_stemmed.df %>% 
  group_by(isbn) %>%
  slice_max(order_by=tf_idf, n=nterms) 

topdis.df <-
  topterms.df %>%
  group_by(isbn) %>%
  summarize(mean_tf_idf = mean(tf_idf)) %>%
  ungroup() %>%
  slice_max(order_by=mean_tf_idf, n= ndiss)
  
distinctive_diss.df <-
  left_join(topdis.df %>% select(isbn),
            topterms.df %>% select(isbn, ngram),
            by = "isbn") %>% 
  group_by(isbn) %>%
  summarize(distinct_terms = paste(ngram, sep =" ", collapse=", ")) %>% 
  ungroup() %>%
  left_join(diss_cleaned.df %>% select(Title,isbn), by="isbn")

distinctive_diss.df %>%
  relocate(Title, distinct_terms) %>%
  select(-isbn) %>%
  DT::datatable (
      data = .,
      extensions = 'Buttons',
      options = list(dom = 'Bfrltip',
                     buttons = c('csv')),
      caption ="Most distinctive terms in dissertations with distinctive terms"
    )
Code
rm(nterms,ndiss, topterms.df, topdis.df, distinctive_diss.df)

Topic Coding

Hand Coded Abstracts

Code
coded_isbns.df <- NULL
Code
hand_codes_tmp.df <-
  coded_abs.df %>% select(isbn,contains(':'),EpiStyle_type) %>% 
  filter( if_any(everything(), ~ !is.na(.x)) )

hand_codes_tmp.df %<>% 
  mutate(EpiStyle_type = str_to_lower(EpiStyle_type)) %>%
  #select(isbn,EpiStyle_type) %>%
  pivot_wider(names_from=EpiStyle_type, values_from=EpiStyle_type,
              names_prefix="epi:") %>% 
  select(-"epi:NA") %>%
  mutate(across(!isbn, 
                  ~ case_when(is.na(.x) ~ FALSE,  .default = TRUE)))

hand_codes_tmp.df %<>%
  pivot_longer(!isbn) %>%
  rename(topic=name) %>%
  mutate(coding_src="hand")

coded_isbns.df %<>% 
  bind_rows(hand_codes_tmp.df)
 
hand_code_tot.df <-
   hand_codes_tmp.df %>% 
    select(-isbn,coding_src) %>%
    filter(value) %>%
    count(topic,value) %>%
    select(-value) %>%
    separate_wider_delim(topic,
                         names=c("dimension","category"),delim=":")

{hand_code_tot.df %>%
    mutate(dimension = as.factor(dimension),
           category=as.factor(category)) %>%
    ggplot(aes(y=n,x=dimension,  fill=category,label=category)) +
    geom_col(position=position_dodge2()) +
    geom_text(position=position_dodge2(width=1), vjust=1) +
    theme(legend.position="none") + 
    facet_wrap(vars(dimension), scales="free_x")} %>%
  plotly::ggplotly()
Code
rm(hand_codes_tmp.df, hand_code_tot.df) 

Exact Match to Coded Topics Terms

Code
seeds_from_codes.df <- coded_abs.df %>% 
  select(contains(':')) %>% 
  pivot_longer(cols=everything(), names_to="topic", values_to = "ngram") %>% 
  filter(!is.na(ngram) & ngram != 'x') %>%
  separate_longer_delim(ngram,';') %>%
  mutate(ngram=str_squish(ngram)) %>%
  filter(ngram!='')

seeds_from_ngrams.df <- 
  suppressMessages( readxl::read_excel("data/ngram_seeds.xlsx") )%>%
  rename(topic=Topic,ngram=Ngram) %>% select(topic,ngram)

seeds_from_ngrams.df %<>% 
  separate_longer_delim(topic,";") %>% 
  filter(!is.na(`topic`) & str_detect(`topic`,":")) %>%
  mutate(topic = str_squish(`topic`))  %>%
  filter(topic!="")

seeds.df <- bind_rows(seeds_from_codes.df, seeds_from_ngrams.df) %>% 
  mutate(ngram = str_to_lower(ngram) %>% 
           str_replace_all(pattern="-"," ") %>%
           str_squish()
           ) %>%
  distinct()

rm(seeds_from_codes.df, seeds_from_ngrams.df)
Code
exact_matches.df <- 
  left_join(seeds.df,
            combined_tf_idf_stemmed.df %>% distinct(),
            relationship="many-to-many",
            by = "ngram"
            )

exact_matches.df %>% 
  select(topic,ngram,isbn) %>%
  distinct() %>%
  group_by(topic,ngram) %>%
  summarize(n=n(), per_dis=n()/sum(dy.ls), .groups="drop") %>%
  arrange(desc(per_dis)) %>% 
  slice_head(n=15) %>%
  gt::gt() %>%
  gt::tab_header(title="most frequently matched topic terms")
most frequently matched topic terms
topic ngram n per_dis
meth:empirical interviews 686 0.3664530
meth:qual interviews 686 0.3664530
claim:desc examine 542 0.2895299
claim:desc examines 471 0.2516026
subj:health health 455 0.2430556
subj:gender gender 438 0.2339744
scope:us united states 435 0.2323718
subj:econ economic 405 0.2163462
subj:race race 404 0.2158120
meth:theory theory 377 0.2013889
subj:race racial 345 0.1842949
meth:qual qualitative 343 0.1832265
subj:education education 328 0.1752137
subj:inequality inequality 321 0.1714744
claim:desc explore 318 0.1698718
Code
nmatched_dis <- exact_matches.df %>% pull(isbn) %>% unique() %>% length

exact_matches.df %<>% 
  select(topic,isbn) %>% 
  distinct() %>%
  mutate(value=TRUE, coding_src="exact")

exact_matches.df %>%
  count(topic) %>%
  ungroup() %>%
  separate_wider_delim(topic,names=c("dimension","category"),delim=":") -> topic_sum.df 

topic_sum.df %<>% 
  group_by(dimension) %>%
  mutate(p=n/sum(dy.ls)) %>%
  ungroup()

lower_q <- .15 
coded_isbns.df %<>% 
  bind_rows(exact_matches.df)

{topic_sum.df%>%
  ggplot(aes(y=p,x=dimension,fill=category,label=category)) +
  geom_col(position=position_dodge2(width=1)) +
  geom_text(position=position_dodge2(width=1), vjust=1) +
  theme(legend.position="none") + 
  facet_wrap(vars(dimension), scales="free_x") +
  labs(x="percent of dissertations that contain topic-seed terms")
  } %>% plotly::ggplotly()
Code
{topic_sum.df%>%
  filter(p>lower_q) %>%
  ggplot(aes(y=p,x=dimension,fill=category,label=category)) +
  geom_col(position=position_dodge2(width=1)) +
  geom_text(position=position_dodge2(width=1), vjust=1) +
  theme(legend.position="none") + 
  facet_wrap(vars(dimension), scales="free_x") +
  labs(x="percent of dissertations that contain topic-seed terms, excluding rare topics")
  } %>% plotly::ggplotly()
Code
topic_sum.df %>% 
  group_by(dimension) %>%
  gt::gt() %>%
  gt::fmt_percent(columns="p")
category n p
claim
desc 1538 82.16%
mod 778 41.56%
strong 321 17.15%
weak 292 15.60%
epi
activist 108 5.77%
constructive 31 1.66%
pos 30 1.60%
util 92 4.91%
meth
empirical 1599 85.42%
longitudinal 218 11.65%
qual 1104 58.97%
quant 845 45.14%
theory 668 35.68%
scope
us 1069 57.10%
world 373 19.93%
subj
class 450 24.04%
crim 418 22.33%
econ 868 46.37%
education 558 29.81%
environment 317 16.93%
ethnicity 290 15.49%
family 604 32.26%
gender 561 29.97%
health 646 34.51%
identity 388 20.73%
immigration 271 14.48%
inequality 1080 57.69%
lifecourse 202 10.79%
media 238 12.71%
movements 263 14.05%
networks 257 13.73%
orgs 557 29.75%
politics 607 32.43%
race 700 37.39%
religion 118 6.30%
rural 100 5.34%
sexuality 185 9.88%
stigma 90 4.81%
urban 280 14.96%
violence 202 10.79%
youth 178 9.51%
Code
rm(lower_q,exact_matches.df)

Data Source Matches

Determine geographical scope by implication from referenced data source.

Code
dataset_codes.df <- readxl::read_excel("./data/stems_and_stops.xlsx",
                                       sheet = "datasets") %>%
      mutate(phrase_clean=clean_text(phrase))

dataset_codes_ls.df <-
  dataset_codes.df %>%
  group_by(topic) %>%
  summarise(phrase_list=list(phrase_clean), .groups="drop") %>%
  rowwise() %>% 
 # mutate(reg=paste0("((^| )",phrase_list,"( |$))",collapse="|")) %>% 
 # much slower -- in this case, we can assume that dataset names don't come
 # at very beginning of end
  mutate(reg=paste0("( ",phrase_list," )",collapse="|")) %>% 
  ungroup()

data_matches.df<- purrr::pmap(dataset_codes_ls.df,
    function(topic,reg,...) {
              diss_cleaned_plus_gender.df %>% 
                filter(stringr::str_detect(abstract_clean, reg)) %>% 
                select(isbn) %>% 
                mutate(topic=topic)
    } 
) %>% purrr::list_rbind() %>%
  distinct() %>% 
  mutate(coding_src="data")

coded_isbns.df %<>% 
  bind_rows(data_matches.df)

data_matches.df %>% 
  count(topic) %>%
  gt::gt()
topic n
scope:us 156
scope:world 7

Topic Models

Code
# tidy helper for keyATM
tidy.keyATM_output<-function(x, matrix="phi", long=FALSE) {
  if (matrix=="phi") {
    res  <- x[["phi"]] %>%
        tibble::as_tibble(rownames=NA) %>%
        tibble::rownames_to_column("topic") 
    if(long) {
      res <-  res %>% 
        tidyr::pivot_longer(!topic) %>%
        dplyr::mutate(term=paste("phi",topic,name,sep="+"),param="phi" ) %>%
        dplyr::relocate(term,estimate=value,topic,word=name)
    }
  } else {
    res <- lda_key[["theta"]]  %>%
        tibble::as_tibble(rownames=NA) %>%
        tibble::rownames_to_column("document")
        if(long) {
    res <-  res %>% tidyr::pivot_longer(!document) %>%
        dplyr::mutate(term=paste("theta",document,name,sep="+"),param="theta" ) %>%
        dplyr::relocate(term,estimate=value,document,topic=name)
    }
  }
  res
}
Code
requireNamespace("quanteda")
Loading required namespace: quanteda
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"ndiMatrix" of class "replValueSp"; definition not updated
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"pcorMatrix" of class "replValueSp"; definition not updated
Code
requireNamespace("keyATM")
Loading required namespace: keyATM
Code
topic_lower_q <- .15
unseeded_topics <- 3
seed_prior_strength <- .95
n_subj_categories <- 12
lower_q <-  topic_lower_q
upper_q <- .9
exclude_dim <- c("claim")

#stemming and grouping

combined_tf_idf_stemmed.df %>% 
  group_by(ngram) %>%
  summarize(n=n()) %>%
  mutate(p=n/sum(dy.ls)) -> ngram_diss_ct.df

core_ngrams.df  <-
  ngram_diss_ct.df %>%
  filter( p >= lower_q, p<= upper_q) %>%
  select("ngram") %>%
  bind_rows(seeds.df %>% select(ngram))  %>%
  distinct() %>%
  left_join(combined_tf_idf_stemmed.df %>% select(ngram,isbn,n)) %>%
  na.omit() %>%
  distinct()
Joining with `by = join_by(ngram)`
Code
core_dfm <- 
  core_ngrams.df %>%
  rename(term=ngram, document=isbn, value=n) %>% 
  cast_dfm(document=document,term=term,value=value)

keyATM_docs <- keyATM::keyATM_read(texts = core_dfm, keep_docnames = TRUE)
ℹ Using quanteda dfm.
Code
index2isbn.df <- tibble(index=keyATM_docs[["doc_index"]], isbn=keyATM_docs[["docnames"]])

index2isbn.df %<>% left_join(diss_cleaned_plus_gender.df %>% select(isbn,Title), by="isbn")

excluded_topics<- topic_sum.df %>%
  filter(p<topic_lower_q) %>% 
  select(dimension,category) %>% 
  transmute(topic=paste(dimension,category,sep=":"))

excluded_areas <- topic_sum.df %>%
    filter(dimension %in% exclude_dim) %>%
      transmute(topic=paste(dimension,category,sep=":"))

excluded_subjects <- topic_sum.df %>%
 filter(dimension=="subj") %>% 
 arrange(desc(p)) %>% 
 slice_tail(n=-1*n_subj_categories ) %>%
 select(dimension,category) %>% 
 transmute(topic=paste(dimension,category,sep=":"))

seeded_topics.ls <- seeds.df %>% 
  anti_join(bind_rows(excluded_topics,excluded_subjects, excluded_areas), by ="topic") %>%
  group_by(`topic`) %>% 
  summarise(ngram_list=list(ngram)) %>% 
  pmap( function(topic,ngram_list) { x<-list(); x[[topic]] <- ngram_list; x} ) %>% 
  list_flatten() 

key_viz <- keyATM::visualize_keywords(docs = keyATM_docs, 
                                      keywords = seeded_topics.ls)
Warning: Keywords are pruned because they do not appear in the documents: in depth
interviews, semi structured interviews, american time use survey, u.s., centers
for disease control, american community survey, department of veterans affairs,
american time use survey, in asian countries, united states and europe, in
france, low income countries, middle income countries, high income countries,
world bank lending, with criminal, their school, their children, …, these
organizations, and their racial
Code
key_viz
Warning: Removed 348 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 348 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 348 rows containing missing values or values outside the scale range
(`geom_label_repel()`).
Warning: ggrepel: 628 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Code
suppressMessages(
  lda_key <- keyATM::keyATM(
  docs              = keyATM_docs,    # text input
  no_keyword_topics = unseeded_topics,              # number of topics without keywords
  keywords          = seeded_topics.ls,       # keywords
  priors            = list(beta_s=seed_prior_strength),
  model             = "base",         # select the model
))
Warning: Keywords are pruned because they do not appear in the documents: in depth
interviews, semi structured interviews, american time use survey, u.s., centers
for disease control, american community survey, department of veterans affairs,
american time use survey, in asian countries, united states and europe, in
france, low income countries, middle income countries, high income countries,
world bank lending, with criminal, their school, their children, …, these
organizations, and their racial
Code
lda_key %>% keyATM::plot_modelfit()

Code
keyATM::semantic_coherence(lda_key, keyATM_docs, n = 10)  -> lda_key.sem
lda_key.sem %>% as_tibble(rownames="topic") %>% rename(coherence=value) %>% gt::gt()
topic coherence
1_meth:empirical -55.30294
2_meth:qual -54.87866
3_meth:quant -49.68228
4_meth:theory -71.94672
5_scope:us -52.64626
6_scope:world -80.31968
7_subj:class -70.60734
8_subj:crim -68.31088
9_subj:econ -52.12322
10_subj:education -43.85082
11_subj:family -66.52815
12_subj:gender -56.89063
13_subj:health -62.55667
14_subj:identity -90.62521
15_subj:inequality -88.46208
16_subj:orgs -73.20833
17_subj:politics -64.45310
18_subj:race -49.35721
Other_1 -111.07606
Other_2 -107.55990
Other_3 -78.48870
Code
lda_key %>% keyATM::top_words() %>%
  gt::gt() %>% 
  gt::tab_header("Top words for each topic")
Top words for each topic
1_meth:empirical 2_meth:qual 3_meth:quant 4_meth:theory 5_scope:us 6_scope:world 7_subj:class 8_subj:crim 9_subj:econ 10_subj:education 11_subj:family 12_subj:gender 13_subj:health 14_subj:identity 15_subj:inequality 16_subj:orgs 17_subj:politics 18_subj:race Other_1 Other_2 Other_3
data interviews [✓] data network states environmental class [✓] police [✓] economic [✓] students [✓] family [✓] gender [✓] health [✓] identity [✓] inequality [✓] organizations [✓] political black [✓] neighborhood migration immigrant
empirical [✓] community results networks united world [✓] workers [✓] criminal [✓] labor [✓] school [✓] children [✓] women mental [✓] religious urban organizational [✓] state racial [✓] media migrants immigrants
survey cultural factors theory [✓] united states [✓] development work law [✓] work education [✓] families [✓] sexual mental health [✓] sexual disparities [✓] management [✓] policy race [✓] social media migrant immigration
effects practices effects status state national middle class [✓] justice [17] workers [7] college [✓] parents [✓] violence medical [✓] queer discrimination [✓] practices movements white [✓] neighborhoods latinx status
examine experiences associated support policy sustainability labor [9] incarceration [✓] economy [✓] educational [✓] life gendered [✓] life lgbtq rural work politics [✓] ethnic change violence usa [5]
relationship people higher social networks american [✓] international [✓] worker [✓] legal [✓] employment [✓] schools [✓] adulthood [✓] experiences healthcare [✓] religion inequalities [✓] public public racism [✓] climate change refugees legal [8]
results work rates [✓] relationships new nations [5] race [18] policing [✓] job [✓] higher marriage [✓] feminist [✓] covid [✓] stigma [15] barriers [✓] organization [✓] justice [✓] indigenous [✓] collective community second generation
factors new support online public transnational [✓] working class [✓] prison [✓] labor market [✓] student [✓] relationships transgender [✓] pandemic [✓] people equity [✓] activists environmental ethnicity individuals youth migration
new process outcomes role local [✓] cultural women punishment [✓] occupational [✓] high childhood [✓] men women [✓] older adults sexuality gentrification [✓] change welfare [✓] racialized [✓] online rural united states [5]
outcomes depth survey social network political human rights [17] social class [✓] violence markets [✓] higher education [✓] work gender inequality [✓] patients [✓] experiences intersectional [✓] movements argue black women [✓] political usa [5] states
Code
lda_key %>% keyATM::top_docs(n=5)%>% 
  pivot_longer(cols=everything()) %>%
  rowwise() %>%
  left_join(index2isbn.df, by=c(value="index")) %>%
  ungroup() %>% 
  select(-value,isbn) %>%
  group_by(name) %>%
  gt::gt() %>%
  gt::tab_header("Top dissertations in each topic")
Top dissertations in each topic
isbn Title
1_meth:empirical
9798380595100 Computational Social Science for Sociological Research
9798672178813 Rednecks and Hillbillies: A Thematic Analysis of the Construction of Pride and High Self-Esteem Exhibited by Southern Characters
9798380836685 Creativity on Demand: Cognition, Materiality, and Sociality in Improvisation
9798534691252 Roaring Tiger or Bounce Tigger? Examining Corruption Seriousness Perception in China from a Socio-Cultural Perspective
9798379570682 Three Papers on the Moral Perceptions of Scientific Misconduct
2_meth:qual
9798834058755 The Ghost in the Machine: Organ Transplantation and the Phenomenology of Embodiment
9798380135238 Narrative Power: Social Control in ISLs &amp; Group Homes
9798381413083 Sociological Silhouettes: The Archaeology and Demography of Knowledge
9798357574244 Within the Waterline: An Exploratory Qualitative Study of How Social Interactions between Leadership Coaches and Their Young Professional Clients Facilitates Leader-Identity Transformation
9798538105540 Excavating Palimpsests in Ponte City
3_meth:quant
9798762198066 Hurricanes and Heart Problems: Natural Disasters, Social Capital, and Cardiovascular Mortality
9798819384725 The Influence of Demographic, Situational, Psychological, and Motivational Factors in Mass Murder Attacks
9798569993567 Recidivism and Crime Specialization Among Male Inmates in Chinese Prisons
9798672157955 Inter Vivos Transfers in Twenty European Countries (2004-2017)
9798802704905 Shareholder Value Minimization?: How Some US Corporations Avoid Institutional Pressures
4_meth:theory
9798845428967 Witchcraft and Partisanship in Contemporary Society: The Case of QAnon
9798569924295 Testing the Intrinsic Benefit Model of the Signaling Theory
9798379718763 Group Cognition: Reclaiming C.S. Peirce’s Specialized Theory of Cognition and Its Implications for Sociology and the Cognitive Sciences
9798841728948 Peircean Pragmatism, Critical Realism, and Cultural Sociology
9798641784465 A Social Network Analysis of Online Gamers' Friendship Networks: Structural Attributes of Steam Friendships, and Comparison of Offline-Online Social Ties of MMO Gamers
5_scope:us
9798357537522 Permanent Austerity: A 40 Year Case Study of Chicago Public Finance
9798834026358 National Carbon Prices
9798351421940 Punctuated Policy Innovation: The Transformation of U.S. Gendered Pay Inequity Policy Alternatives, 1945-2019
9798460416622 The Caduceus of Modernity: States, Pirates, and the Battle for Sovereignty
9798380620642 Marketing the Missing: Missing Persons and the Competition for Concern
6_scope:world
9798516927935 The Age of Protest: World-Historical Structure and Dynamics of Protest Waves in the Global South in the Long Twentieth Century
9798380590273 Individual Empowerment in Multiple Modernities: A Quantitative Exploration of the Relationship Between Economic Development, Culture, and Civil Liberties
9798841749431 Reproduction of Nationhood: Citizenship and Immigration Policy in Turkey
9798351436784 Agent Orange and the Treadmill of Destruction: Environmental Degradation and Inequality in Vietnam
9798379914578 The Global Path of Human Rights: A Quantitative Analysis of Trajectories, Democracy, and Development
7_subj:class
9798780656401 Solidarity Infrastructure: Gender and Race Solidarity and Cross-Class Coalitions in the Kansas City General Strike of 1918
9798380852579 Dallas: Kinship, Mobility, and Inheritance in an Elite Population, 1895-1945
9798380858113 <strong>WORK, CLASS AMBIGUITY, AND MULTIPLE FEMININITIES</strong> Women Beauty and Retail Workers in Pakistan’s New Service Economy
9798790667619 Constructing the Perfect Girlfriend: Gender, Class, Race, &amp; Performativity of Paid Intimacy in Nevada Brothels
9798460487561 State, Street, Store: The Development of the Chinese Middle Class
8_subj:crim
9798460434251 The Ideology of the Carceral State: Examining the Prison through Film
9798834020844 Attitudes of Guangzhou Public toward Criminal Retrial: Predictors and Process from the Relationism Theory
9798358480247 Can Electronic Monitoring Fix Mass Incarceration?: Understanding the Role of Electronic Monitoring in Local Policy Reform Efforts
9781658492614 Analysis of the YLS/CMI Risk Assessment and Recidivism Prediction for Justice-Involved Youth
9798538120000 Breaking Out of Prison Culture? Or Culture Breaking Out of Prison?: Understanding Convict Code Endorsement during Incarceration and Post-Release
9_subj:econ
9798678131263 Bearing a Beloved Burden: Surrogates, Reproductive Labor, and Carrying Babies for Others
9798557097413 Bad Jobs or ‘Badass’ Jobs? Fishing Guides, Self-Employment, and Service Work in the New Economy
9798379575199 Changes in Occupational Structure and Labor Market Outcomes in China
9798492722500 Taylored Flexibility: Agile, Control, and the Software Labor Process
9798597058108 Government Regulation and Economic Globalization: The Diffusion of Antitrust Law and Merger Control and Its Influence on Cross-Border Mergers and Acquisitions
10_subj:education
9798834020516 Risk and Protective Factors in Adolescent Substance Use: Comparing Students of Regular High Schools and Vocational High Schools in China
9798535591438 Educational Trajectories of Indigenous Students: Vertical and Horizontal Stratification in the Chilean Educational System
9798380599337 Transition in Times of Transition: The Last Year of High School and the College-Going (Or Not-Going) Decision-Making Process of Senior High School Students in Chile
9798380595209 STEM Participation, Persistence, and Attrition Among First-Generation College Students
9798841795469 Encounters with Uncertainty: How Secondary Education Policies Shape Parental Involvement in Schooling in Contemporary Turkey
11_subj:family
9798357550293 The Consequences of Family Stress in the Work-Family Interface: Three Longitudinal Studies in Canada
9798802748862 Constructing Childhood through Social Interaction: Rights, Obligations, and Accountability in Adult-Child Interaction
9798379422578 Intergenerationality: The Making and Consequences of Parental Intervention in Children’s Marriage in China
9798678157829 Dissolution Pathways: Mother-Child Relationship Quality, Adolescent Academic Well-Being, and College Completion Among Young Adults
9798380176071 Three Essays on Responsibility and the Transition to Adulthood in Comparative Perspective
12_subj:gender
9798438745518 The Rape Paradox: The Effect of Anti-Sexual Assault Policies on Gender and Sexual Assault Attitudes, Donations to Sexual Assault Campaigns, and Policy Compliance
9798841754534 Gender Knowledge: Category, Status, Transgression, Policing, and Perception
9798845426468 Sexual Interaction in Action: Consent and Agency in the Sexual Communications of Early Adolescents and Adults
9798379586041 “Skanks Need to Pay with their Lives”: Online Harassment, Cyberstalking, and Violence Against Women Online
9798662468405 Reproducing Gender Inequality in Gaming: A Dual Analysis of the Gaming Hierarchy and Gamer Identity
13_subj:health
9798379620769 Surveillance Medicine in Perinatal Care: Negotiating Constraints, Constructing Risk, and the Elusive Goal of Mental Health Integration
9798379946050 Exploring Multiple Stakeholder Experiences of Healthcare: Opportunities for Intervention and Change to Improve Care and Outcomes for People with Mental Illness
9798819388891 Exploring Differential Exposure to Adverse Social Determinants of Health for Children with Developmental Disabilities and Their Related Health Outcomes and Resiliency
9798790654770 Educational Disparities in Chronic Pain and Life Expectancy: Gaps and Pathways
9798460476299 Multidimensional Health: Applications Across Cultures, Cohorts, and the COVID-19 Pandemic
14_subj:identity
9798607337148 Speaking Through the Silence: Narratives, Interaction, and the Construction of Sexual Selves
9798358499164 Institutionalized Normative Heterosexuality: The Case of Sexual Fluidity
9798664798906 Out in Front: How LGBQ Women and Genderqueer Clergy Navigate Religious Communities
9798582505327 Behind the Laughter: Health, Stigma and Communal Coping Among Standup Comedians
9798780637288 LGBTQ YouTube: Community and Branding through New Media
15_subj:inequality
9798534659009 Income Inequality and Caste in India: Evidence from India Human Development Surveys
9798662382343 The Social Etiology of Prescription Psychotherapeutic Misuse among Rural and Urban Adolescents and Emerging Adults: Concurrent and Prospective Stress-Process Analyses
9798374405644 The Rent Is Too Damn High: The Spatial and Longitudinal Dimensions of Housing Affordability
9798738630620 The Measurement and Social Consequences of U.S. Income Inequality
9798759981954 Urban-Rural Differences in Lifespan Variation in the United States
16_subj:orgs
9798678120151 Laissez Fairy Tales: Consensus, Cohesion, and Corporate Culture During the Collapse of Lehman Brothers
9798534666571 From Angry Militants to Happy Vegans: Animal Rights Activists' Recasting for Vegan Recruitment
9781658487306 Reinventing Diversity, Activists Interfering with the Managerialization of the Law
9798460471942 Fashioning Religious Selves in a Secular Age
9798834058038 Welcome Back a Yard: Jamaican Imaginings of the Diasporan Homebuyer
17_subj:politics
9798380091572 Erasing Violence: Political Contests over the Recognition of the Great Ukrainian Famine of 1932-1933
9798381107340 Unsettling Science: How Activists Deployed Science in the Conversion Therapy Debate
9798662595316 Militarism, Democracy, and Concordance: The Role of Citizenry in (Re)-Establishing Democratic Order in Argentina and Turkey
9798351456461 the Fraternal Twins: A Comparative Study of Hegemony Building, Political Mobilization and Crony Capitalism in Neo-Authoritarian Turkey and Russia (1991–2021)
9798841795353 Mobilizing Consumers: The American Consumer Movement in the 1960s–70s as a Social Movement
18_subj:race
9798352940266 Understanding “Our” Similarities and Differences Academically, Socially, and Psychologically: The Race-Gendered Experiences of Black Men and White Men Enrolled in STEM Doctoral Programs
9798662409545 The Macro and Micro Foundations of Racial Residential Segregation: A Contemporary Analysis
9798672129129 Social Stratification in the Mortgage Market Post the Great Recession
9798845451798 Internalizing Achievement Inequality: The Development of Racial/Ethnic Differences in Mathematics Attitudes and Their Implications for Persistence in STEM
9798351468013 Backdoor to Essentialism? Genetic Ancestry Testing and the Social Deconstruction of Whiteness
Other_1
9781392539118 Three Papers on Collective Action
9798780611493 Understanding Climate Change Discourse in China
9798480642933 How Social Media Affects Political Action: The Effects of Digital Network Structures and Motivations on Movement Participation
9798460436040 “Acts of Pure Evil”: The Portrayal of Mass Shooting Events on Online Media Platforms
9798380858816 Autism in Perpetual Motion: A Methodology for Examining Social Media Data via the Case of Autistic Influencers of Twitter, 2018-2019
Other_2
9798438794738 Knowledge, Place, and Experience in the Migrant Journey: How Central American Migrant Youth Negotiate Violence in Mexico
9798845452276 The Role of Community Context Factors in explaining International Migrant Flows and their Composition: Three Studies based on the Mexico-U.S. Case
9798380897396 The Emergence of Ina-Ethe Migration : Mpondo Men and Continued Migrant Labour Post-Apartheid
9798837553493 Intersubjective Understanding of Violence: The Lifeworld of Tajik Immigrant Workers in Post-Soviet Russia
9798841770886 Spatial Disparity and Educational Sorting of U.S. Internal Migration
Other_3
9798535509976 Legal Status Fluidity: Theorizing Legal Status Transitions and How Filipino Immigrants Navigate Immigration Pathways
9798351434360 Bureaucracies of Removal: The Labor and Logics of US Immigration Courts
9798837524691 Social and Economic Consequences of “Get Tough” on Immigration
9798538103768 Measuring the Effect of Immigrant Legal Status on Socioeconomic Outcomes: Variation by Legal Status Assignment Approach
9798381168358 Barriers to Legal Status Attainment and the Outcomes of Immigrants and Their Children
Code
{lda_key %>% keyATM::plot_topicprop(
  n = 5,
  show_topic = NULL,
  show_topwords = TRUE,
  label_topic = NULL,
  order = "proportion")}[["figure"]] %>%
  plotly::ggplotly()
Code
# documents with n% of words on specific topic

tidy(lda_key, matrix="theta", long=TRUE) %>%
  arrange(desc(estimate)) %>% 
  filter(estimate > .2)  %>%
  count(topic) %>%
  gt::gt() %>%
  gt::tab_header("number of dissertations with => 20% words from given topic")
number of dissertations with => 20% words from given topic
topic n
10_subj:education 181
11_subj:family 144
12_subj:gender 157
13_subj:health 257
14_subj:identity 85
15_subj:inequality 118
16_subj:orgs 76
17_subj:politics 257
18_subj:race 237
1_meth:empirical 528
2_meth:qual 792
3_meth:quant 183
4_meth:theory 104
5_scope:us 145
6_scope:world 57
7_subj:class 43
8_subj:crim 141
9_subj:econ 117
Other_1 48
Other_2 43
Other_3 51
Code
lda_matches.df <- tidy(lda_key, matrix="theta", long=TRUE) %>%
  arrange(desc(estimate)) %>% 
    filter(estimate > .2) %>%
    select(isbn=document, topic=topic) %>%
    mutate(topic=str_replace(topic,"\\d+_","")) %>%
    mutate(topic=str_replace(topic,"^Other","subj:other")) %>%
    distinct() %>%
    mutate(value=TRUE, coding_src="LDA")

coded_isbns.df %<>% filter(coding_src!="LDA")
coded_isbns.df %<>% bind_rows(lda_matches.df)

Bi-variate analysis

Code
coded_isbns_wide.df <- coded_isbns.df %>%
  select(-coding_src) %>%
  filter(value) %>%
  distinct()  %>% 
  pivot_wider(values_from=value, names_from=topic, values_fill=FALSE)

coded_isbns_wide_nest.df<- coded_isbns.df %>%
  filter(value) %>%
  select(isbn,topic) %>%
  distinct() %>%
  separate_wider_delim(topic,delim=":", names=c("dimension","category")) %>%
  pivot_wider(names_from="dimension", values_from=category,
              values_fn=list) %>% 
  left_join(diss_cleaned_plus_gender.df %>% select(isbn,og_pr_F), by="isbn") 
  
coded_isbns_wide_unpacked.df <-
  coded_isbns_wide_nest.df%>% select(isbn,subj,meth,scope,og_pr_F) %>%
  unnest(cols=c(subj), keep_empty =TRUE) %>%
  unnest(cols=c(meth), keep_empty =TRUE) %>%
  unnest(cols=c(scope) , keep_empty =TRUE) 

coded_isbns_gender.df <- 
  coded_isbns.df %>%
  ungroup() %>%
  filter(value) %>%
  select(!coding_src) %>%
  distinct() %>%
  left_join(diss_cleaned_plus_gender.df %>% select(isbn, og_pr_F), by="isbn") %>%
   separate_wider_delim(topic,names=c("dimension",
                                     "category"),delim=":")

coded_isbns.df %>%
  filter(value) %>%
  select(-coding_src) %>%
  distinct() %>%
  count(topic,sort=TRUE) %>%
  ungroup() %>%
  separate_wider_delim(topic,names=c("dimension",
                                     "category"),delim=":") ->   all_topic_sum.df 
all_topic_sum.df %<>% 
  group_by(dimension) %>%
  mutate(p=n/sum(dy.ls)) %>%
  ungroup()
Code
all_topic_sum.df
# A tibble: 47 × 4
   dimension category       n     p
   <chr>     <chr>      <int> <dbl>
 1 meth      empirical   1638 0.875
 2 claim     desc        1538 0.822
 3 meth      qual        1198 0.640
 4 subj      inequality  1081 0.577
 5 scope     us          1080 0.577
 6 subj      econ         868 0.464
 7 meth      quant        862 0.460
 8 claim     mod          778 0.416
 9 meth      theory       709 0.379
10 subj      race         701 0.374
# ℹ 37 more rows
Code
{all_topic_sum.df%>%
  filter(p>lower_q) %>%
  ggplot(aes(y=p,x=dimension,fill=category,label=category)) +
  geom_col(position=position_dodge2(width=1)) +
  geom_text(position=position_dodge2(width=1), vjust=1) +
  theme(legend.position="none") + 
  facet_wrap(vars(dimension), scales="free_x") +
  labs(x="ALL CODING METHODS: percent of dissertations in each category, excluding rare")
  } %>% plotly::ggplotly()
Code
coded_isbns_wide_unpacked.df %>%
  group_by(subj,meth,scope) %>%
  summarize(.groups="drop",
       n = length(unique(isbn)),
       pr_F= mean(distinct(data.frame(isbn,og_pr_F))[["og_pr_F"]], na.rm=TRUE   )
            ) %>%
  DT::datatable (
      data = .,
      extensions = 'Buttons',
      options = list(dom = 'Bfrltip',
                     buttons = c('csv')),
      caption ="subj x meth x scope frequencies"
    )
Code
coded_isbns_wide_unpacked.df %>%
  group_by(subj) %>% 
  mutate(subj_total = length(unique(isbn)))  %>%
  group_by(subj,meth) %>%
  summarize(.groups="drop",
       n = length(unique(isbn)),
       percent_of_subject = n/unique(subj_total),
       pr_F= mean(distinct(data.frame(isbn,og_pr_F))[["og_pr_F"]], na.rm=TRUE   )
            ) -> meth_subj_freq.df 

coded_isbns_wide_unpacked.df %>%
  group_by(subj) %>% 
  mutate(subj_total = length(unique(isbn)))  %>%
  group_by(subj,scope) %>%
  summarize(.groups="drop",
       n = length(unique(isbn)),
       percent_of_subject = n/unique(subj_total),
       pr_F= mean(distinct(data.frame(isbn,og_pr_F))[["og_pr_F"]], na.rm=TRUE   )
            ) -> scope_subj_freq.df 

coded_isbns_wide_unpacked.df %>%
  group_by(subj) %>% 
  summarize(subject_total = length(unique(isbn)))  -> subject_totals.df

meth_subj_freq.df %>%
  mutate(display=glue::glue('{round(percent_of_subject,2)} ({n})')) %>%
  select(subj,meth,display) %>%
  pivot_wider(names_from=meth,values_from=display, values_fill="0") %>%
  gt::gt() %>%
  gt::tab_header("proportion of each subject employing methodology") %>%
  gt::cols_label(subj="") %>%
    gt::tab_caption("Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses. ")
Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses.
proportion of each subject employing methodology
empirical longitudinal qual quant theoretical theory NA
class 0.9 (405) 0.14 (61) 0.69 (311) 0.45 (204) 0 (1) 0.35 (159) 0.03 (12)
crim 0.86 (360) 0.12 (49) 0.63 (265) 0.46 (194) 0 0.35 (148) 0.04 (15)
econ 0.89 (774) 0.13 (117) 0.64 (553) 0.48 (413) 0 0.38 (328) 0.03 (23)
education 0.88 (489) 0.14 (78) 0.6 (337) 0.55 (306) 0 0.39 (220) 0.03 (14)
environment 0.9 (285) 0.12 (37) 0.7 (221) 0.41 (130) 0 0.39 (123) 0.03 (8)
ethnicity 0.89 (259) 0.14 (40) 0.61 (178) 0.59 (171) 0 0.38 (109) 0.03 (9)
family 0.9 (546) 0.17 (103) 0.58 (349) 0.5 (304) 0 0.35 (211) 0.02 (14)
gender 0.9 (507) 0.1 (58) 0.64 (363) 0.5 (281) 0 (1) 0.39 (218) 0.03 (15)
health 0.9 (583) 0.15 (96) 0.59 (378) 0.56 (362) 0 0.39 (251) 0.02 (11)
identity 0.91 (375) 0.06 (25) 0.78 (324) 0.36 (147) 0 (1) 0.4 (167) 0.02 (9)
immigration 0.9 (245) 0.13 (36) 0.68 (185) 0.45 (122) 0 0.35 (95) 0.02 (6)
inequality 0.9 (969) 0.12 (127) 0.65 (698) 0.5 (537) 0 0.37 (397) 0.03 (28)
lifecourse 0.93 (187) 0.3 (61) 0.43 (87) 0.61 (123) 0 0.38 (76) 0.02 (4)
media 0.91 (216) 0.06 (14) 0.72 (172) 0.39 (94) 0 0.43 (102) 0.02 (5)
movements 0.86 (225) 0.07 (19) 0.79 (208) 0.34 (89) 0 0.41 (109) 0.03 (9)
networks 0.88 (226) 0.13 (34) 0.6 (154) 0.5 (129) 0 0.57 (146) 0.01 (2)
orgs 0.88 (497) 0.08 (46) 0.79 (443) 0.39 (222) 0 0.38 (213) 0.02 (10)
other_1 0.88 (42) 0.02 (1) 0.6 (29) 0.46 (22) 0 0.31 (15) 0.04 (2)
other_2 0.91 (39) 0.07 (3) 0.72 (31) 0.42 (18) 0 0.21 (9) 0.02 (1)
other_3 0.88 (45) 0.14 (7) 0.69 (35) 0.49 (25) 0 0.39 (20) 0.02 (1)
politics 0.85 (545) 0.11 (71) 0.71 (453) 0.38 (244) 0 0.37 (238) 0.04 (23)
race 0.88 (620) 0.11 (76) 0.66 (465) 0.49 (346) 0 (1) 0.38 (266) 0.03 (24)
religion 0.86 (102) 0.09 (11) 0.62 (73) 0.53 (63) 0 0.35 (41) 0.02 (2)
rural 0.88 (88) 0.12 (12) 0.6 (60) 0.48 (48) 0 0.34 (34) 0.05 (5)
sexuality 0.91 (169) 0.05 (10) 0.77 (143) 0.39 (73) 0.01 (1) 0.37 (68) 0.04 (7)
stigma 0.93 (84) 0.03 (3) 0.71 (64) 0.38 (34) 0 0.46 (41) 0.01 (1)
urban 0.87 (244) 0.14 (39) 0.64 (180) 0.5 (139) 0 0.35 (97) 0.03 (8)
violence 0.86 (174) 0.1 (20) 0.65 (132) 0.45 (90) 0 0.38 (76) 0.04 (9)
youth 0.94 (167) 0.29 (52) 0.54 (96) 0.54 (97) 0 0.35 (62) 0.01 (2)
NA 0.71 (5) 0 0.43 (3) 0.14 (1) 0 0.43 (3) 0
Code
meth_subj_freq.df %>%
  mutate(display=glue::glue('{round(pr_F,2)} ({n})')) %>%
  select(subj,meth,display) %>%
  pivot_wider(names_from=meth,values_from=display, values_fill="0") %>%
  gt::gt() %>%
  gt::tab_header("Female proportion by subject & method") %>%
  gt::cols_label(subj="") %>% 
  gt::tab_caption("Categories are overlapping. Number of dissertations in category in parentheses. ")
Categories are overlapping. Number of dissertations in category in parentheses.
Female proportion by subject & method
empirical longitudinal qual quant theoretical theory NA
class 0.64 (405) 0.54 (61) 0.67 (311) 0.57 (204) 0.96 (1) 0.59 (159) 0.59 (12)
crim 0.61 (360) 0.49 (49) 0.65 (265) 0.56 (194) 0 0.59 (148) 0.47 (15)
econ 0.59 (774) 0.51 (117) 0.61 (553) 0.57 (413) 0 0.53 (328) 0.62 (23)
education 0.64 (489) 0.55 (78) 0.67 (337) 0.62 (306) 0 0.58 (220) 0.65 (14)
environment 0.55 (285) 0.57 (37) 0.55 (221) 0.53 (130) 0 0.46 (123) 0.42 (8)
ethnicity 0.6 (259) 0.69 (40) 0.62 (178) 0.59 (171) 0 0.59 (109) 0.67 (9)
family 0.67 (546) 0.58 (103) 0.7 (349) 0.62 (304) 0 0.61 (211) 0.49 (14)
gender 0.73 (507) 0.7 (58) 0.77 (363) 0.7 (281) 0.96 (1) 0.72 (218) 0.7 (15)
health 0.63 (583) 0.61 (96) 0.68 (378) 0.6 (362) 0 0.59 (251) 0.46 (11)
identity 0.62 (375) 0.62 (25) 0.63 (324) 0.6 (147) 0.96 (1) 0.61 (167) 0.77 (9)
immigration 0.61 (245) 0.61 (36) 0.64 (185) 0.56 (122) 0 0.57 (95) 0.67 (6)
inequality 0.63 (969) 0.54 (127) 0.65 (698) 0.62 (537) 0 0.59 (397) 0.64 (28)
lifecourse 0.66 (187) 0.62 (61) 0.69 (87) 0.62 (123) 0 0.56 (76) 0.5 (4)
media 0.55 (216) 0.42 (14) 0.6 (172) 0.44 (94) 0 0.49 (102) 0.71 (5)
movements 0.57 (225) 0.56 (19) 0.58 (208) 0.54 (89) 0 0.47 (109) 0.42 (9)
networks 0.54 (226) 0.53 (34) 0.55 (154) 0.51 (129) 0 0.47 (146) 0.48 (2)
orgs 0.6 (497) 0.42 (46) 0.63 (443) 0.56 (222) 0 0.52 (213) 0.47 (10)
other_1 0.48 (42) NaN (1) 0.56 (29) 0.51 (22) 0 0.37 (15) 1 (2)
other_2 0.58 (39) 0.89 (3) 0.53 (31) 0.42 (18) 0 0.86 (9) 1 (1)
other_3 0.58 (45) 0.94 (7) 0.54 (35) 0.5 (25) 0 0.5 (20) 0 (1)
politics 0.57 (545) 0.46 (71) 0.58 (453) 0.52 (244) 0 0.51 (238) 0.53 (23)
race 0.61 (620) 0.6 (76) 0.63 (465) 0.61 (346) 0.96 (1) 0.56 (266) 0.61 (24)
religion 0.49 (102) 0.15 (11) 0.55 (73) 0.43 (63) 0 0.47 (41) 0.5 (2)
rural 0.62 (88) 0.46 (12) 0.63 (60) 0.52 (48) 0 0.6 (34) 0.67 (5)
sexuality 0.65 (169) 0.66 (10) 0.66 (143) 0.66 (73) 0.96 (1) 0.65 (68) 0.6 (7)
stigma 0.61 (84) 0.33 (3) 0.64 (64) 0.59 (34) 0 0.62 (41) 1 (1)
urban 0.52 (244) 0.53 (39) 0.53 (180) 0.52 (139) 0 0.43 (97) 0.58 (8)
violence 0.69 (174) 0.65 (20) 0.72 (132) 0.63 (90) 0 0.68 (76) 0.56 (9)
youth 0.61 (167) 0.52 (52) 0.72 (96) 0.55 (97) 0 0.63 (62) 1 (2)
NA 0.4 (5) 0 0.33 (3) 0 (1) 0 1 (3) 0
Code
coded_isbns_gender.df %>%
  group_by(dimension,category) %>%
  summarize(n=n(),p=n/sum(dy.ls), pr_F=mean(og_pr_F,na.rm=TRUE),
            .groups="drop") -> category_gender.df
  
{ category_gender.df %>%
    filter(p>lower_q) %>%
  ggplot(aes(y=pr_F,x=dimension,fill=category,label=category)) +
  geom_col(position=position_dodge2(width=1)) +
  geom_text(position=position_dodge2(width=1), vjust=1) +
  theme(legend.position="none") + 
  facet_wrap(vars(dimension), scales="free_x") +
  labs(x="gender ratio of  each category, excluding rare")} %>%
  plotly::ggplotly()
Code
coded_isbns_wide_unpacked.df %>%
  group_by(subj) %>% 
  mutate(subj_total = length(unique(isbn)))  %>%
  group_by(subj,meth) %>%
  summarize(.groups="drop",
       n = length(unique(isbn)),
       percent_of_subject = n/unique(subj_total),
       pr_F= mean(distinct(data.frame(isbn,og_pr_F))[["og_pr_F"]], na.rm=TRUE   )
            ) -> coded_subj_freq.df 

preprint formatted tables

Code
subject_totals.df
# A tibble: 30 × 2
   subj        subject_total
   <chr>               <int>
 1 class                 450
 2 crim                  419
 3 econ                  868
 4 education             558
 5 environment           317
 6 ethnicity             290
 7 family                606
 8 gender                563
 9 health                646
10 identity              413
# ℹ 20 more rows
Code
# proportions table
subject_totals.df %>% 
  filter(subj!="NA") %>%
  filter(!str_starts(subj,"other")) %>%
  left_join(by="subj", 
    meth_subj_freq.df %>%
    mutate(display=percent_of_subject) %>%
    select(subj,meth,display) %>%
    pivot_wider(names_from=meth,values_from=display, values_fill=0) %>%
    select(!"NA")) %>%
  left_join(by="subj", 
    scope_subj_freq.df %>%
    mutate(display=percent_of_subject) %>%
    select(subj,scope,display) %>%
    pivot_wider(names_from=scope,values_from=display, values_fill=0) %>%
    select(!"NA")
    ) %>%
  gt::gt() %>%
  gt::tab_header("Proportion of each subject by methodology and scope") %>%
  gt::cols_label(subj="") %>%
  gt::cols_hide(c("theoretical")) %>%
  gt::tab_spanner(
    label = "methodology",
    columns = c(
      "empirical","longitudinal","qual","quant","theory"
    )
  ) %>%
  gt::tab_spanner(
    label = "scope",
    columns = c(
      "us","world",
    )
  ) %>%
  gt::cols_label(subject_total="(N)") %>%
  gt::fmt_number(decimals=2) %>%
  gt::fmt_number(columns=subject_total,decimals=0) %>%
  gt::data_color(
     columns = !c("subj","subject_total"),
     direction = "row",
     method="numeric",
     domain=c(0,1),
     palette = "Oranges",
     na_color = "white"
   ) %>%
  gt::tab_footnote("Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses. ")
Proportion of each subject by methodology and scope
(N) methodology scope
empirical longitudinal qual quant theory us world
class 450 0.90 0.14 0.69 0.45 0.35 0.62 0.21
crim 419 0.86 0.12 0.63 0.46 0.35 0.61 0.20
econ 868 0.89 0.13 0.64 0.48 0.38 0.61 0.24
education 558 0.88 0.14 0.60 0.55 0.39 0.59 0.17
environment 317 0.90 0.12 0.70 0.41 0.39 0.56 0.29
ethnicity 290 0.89 0.14 0.61 0.59 0.38 0.71 0.17
family 606 0.90 0.17 0.58 0.50 0.35 0.59 0.17
gender 563 0.90 0.10 0.64 0.50 0.39 0.56 0.20
health 646 0.90 0.15 0.59 0.56 0.39 0.60 0.16
identity 413 0.91 0.06 0.78 0.36 0.40 0.53 0.23
immigration 271 0.90 0.13 0.68 0.45 0.35 0.71 0.33
inequality 1,081 0.90 0.12 0.65 0.50 0.37 0.61 0.20
lifecourse 202 0.93 0.30 0.43 0.61 0.38 0.66 0.13
media 238 0.91 0.06 0.72 0.39 0.43 0.52 0.24
movements 263 0.86 0.07 0.79 0.34 0.41 0.57 0.25
networks 257 0.88 0.13 0.60 0.50 0.57 0.56 0.23
orgs 563 0.88 0.08 0.79 0.39 0.38 0.60 0.22
politics 642 0.85 0.11 0.71 0.38 0.37 0.61 0.28
race 701 0.88 0.11 0.66 0.49 0.38 0.67 0.16
religion 118 0.86 0.09 0.62 0.53 0.35 0.65 0.26
rural 100 0.88 0.12 0.60 0.48 0.34 0.52 0.25
sexuality 185 0.91 0.05 0.77 0.39 0.37 0.55 0.17
stigma 90 0.93 0.03 0.71 0.38 0.46 0.48 0.13
urban 280 0.87 0.14 0.64 0.50 0.35 0.72 0.22
violence 202 0.86 0.10 0.65 0.45 0.38 0.62 0.21
youth 178 0.94 0.29 0.54 0.54 0.35 0.59 0.17
Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses.
Code
coded_isbns_gender.df %>%
  filter(dimension=="subj") %>%
  group_by(category) %>%
  summarize(subject_gender=mean(og_pr_F,na.rm=TRUE),
            .groups="drop") %>%
  rename(subj=category) -> subject_genders.df


# gender table
subject_totals.df %>% 
  filter(subj!="NA") %>%
  filter(!str_starts(subj,"other")) %>%
  left_join(by="subj", subject_genders.df) %>%
  left_join(by="subj", 
    meth_subj_freq.df %>%
    mutate(display=pr_F) %>%
    select(subj,meth,display) %>%
    pivot_wider(names_from=meth,values_from=display, values_fill=0) %>%
    select(!"NA")) %>%
  left_join(by="subj", 
    scope_subj_freq.df %>%
    mutate(display=pr_F) %>%
    select(subj,scope,display) %>%
    pivot_wider(names_from=scope,values_from=display, values_fill=0) %>%
    select(!"NA")
    ) %>%
  bind_rows(
    category_gender.df %>% 
     filter(dimension %in% c("meth","scope")) %>% 
      select(category,pr_F) %>%
      pivot_wider(names_from=category,values_from=pr_F, values_fill=0) %>%
      mutate(subj="")
  ) %>%
  gt::gt() %>%
  gt::tab_header("Female proportion for each subject by method and scope") %>%
  gt::cols_label(subj="") %>%
  gt::cols_hide(c("theoretical")) %>%
  gt::tab_spanner(
    label = "methodology",
    columns = c(
      "empirical","longitudinal","qual","quant","theory"
    )
  ) %>%
  gt::tab_spanner(
    label = "scope",
    columns = c(
      "us","world",
    )
  ) %>%
  gt::cols_label(subject_total="(N)") %>%
  gt::cols_label(subject_gender="(subject mean)") %>%
  gt::fmt_number(decimals=2) %>%
  gt::fmt_number(columns=subject_total,decimals=0) %>%
  gt::sub_missing() %>%
  gt::data_color(
     columns = !c("subj","subject_total"),
     direction = "row",
     method="numeric",
     domain=c(0,1),
     palette = "RdBu",
     na_color = "white"
   ) %>% 
  gt::tab_row_group(
    label = "",
    rows = subj !="",
  ) %>%
  gt::tab_row_group(
    label = "Category Means",
    rows = subj =="",
  ) %>%
  gt::row_group_order(groups = c("", "Category Means")) %>%
  gt::tab_footnote("Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses. ")
Female proportion for each subject by method and scope
(N) (subject mean) methodology scope
empirical longitudinal qual quant theory us world
class 450 0.63 0.64 0.54 0.67 0.57 0.59 0.61 0.56
crim 419 0.59 0.61 0.49 0.65 0.56 0.59 0.57 0.49
econ 868 0.58 0.59 0.51 0.61 0.57 0.53 0.58 0.51
education 558 0.63 0.64 0.55 0.67 0.62 0.58 0.61 0.59
environment 317 0.54 0.55 0.57 0.55 0.53 0.46 0.53 0.55
ethnicity 290 0.59 0.60 0.69 0.62 0.59 0.59 0.62 0.52
family 606 0.66 0.67 0.58 0.70 0.62 0.61 0.64 0.64
gender 563 0.72 0.73 0.70 0.77 0.70 0.72 0.73 0.70
health 646 0.63 0.63 0.61 0.68 0.60 0.59 0.62 0.59
identity 413 0.61 0.62 0.62 0.63 0.60 0.61 0.60 0.51
immigration 271 0.60 0.61 0.61 0.64 0.56 0.57 0.57 0.55
inequality 1,081 0.62 0.63 0.54 0.65 0.62 0.59 0.60 0.54
lifecourse 202 0.64 0.66 0.62 0.69 0.62 0.56 0.62 0.74
media 238 0.54 0.55 0.42 0.60 0.44 0.49 0.55 0.49
movements 263 0.55 0.57 0.56 0.58 0.54 0.47 0.56 0.45
networks 257 0.51 0.54 0.53 0.55 0.51 0.47 0.49 0.41
orgs 563 0.59 0.60 0.42 0.63 0.56 0.52 0.58 0.49
politics 642 0.54 0.57 0.46 0.58 0.52 0.51 0.51 0.49
race 701 0.60 0.61 0.60 0.63 0.61 0.56 0.59 0.54
religion 118 0.48 0.49 0.15 0.55 0.43 0.47 0.48 0.39
rural 100 0.61 0.62 0.46 0.63 0.52 0.60 0.60 0.57
sexuality 185 0.65 0.65 0.66 0.66 0.66 0.65 0.65 0.52
stigma 90 0.63 0.61 0.33 0.64 0.59 0.62 0.65 0.65
urban 280 0.52 0.52 0.53 0.53 0.52 0.43 0.49 0.45
violence 202 0.67 0.69 0.65 0.72 0.63 0.68 0.65 0.61
youth 178 0.63 0.61 0.52 0.72 0.55 0.63 0.56 0.72
Category Means
0.58 0.53 0.61 0.56 0.54 0.56 0.51
Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses.
Code
library(ggalluvial)

coded_isbns_wide.df %>%
  count(`subj:econ`,`meth:qual`,`claim:strong`) %>%
   mutate(`subj:econ`=as.factor(`subj:econ`) 
          %>% fct_recode( econ="TRUE",other_subj="FALSE"),
          `meth:qual`=as.factor(`meth:qual`)  %>%
            fct_recode( qualitative="TRUE",other_meth="FALSE"),
         `claim`=as.factor(`claim:strong`)  %>%
           fct_recode( strong="TRUE",not_strong="FALSE")
          ) %>%
  as.data.frame() %>%
  ggplot(aes(
    y=n,
    axis1=`subj:econ`,
    axis2=`meth:qual`,
  )) +
  geom_alluvium(aes(fill = claim)) +
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("econ", "qualitative"))

Code
vcd::mosaic( ~ `subj:econ` + `meth:qual` +`claim:strong`, data = coded_isbns_wide.df, shade = FALSE)