Sociology Abstracts

Setup

Load data from proquest on 2020-2024-02-01 sociology department dissertations in English. Clean and tokenise as 2-skip-1 ngrams

Code

if (!file.exists("data/coded_abstracts.xlsx") || 
    !file.exists("data/ngram_seeds.xlsx") || TRUE
    ) {
requireNamespace("googledrive")
folderid <- googledrive::as_id("https://drive.google.com/drive/folders/1eFat4mCoRZX22gPLTj9w5Bn63U3q71yo")

filels <- googledrive::drive_ls(folderid)

googledrive::drive_download(
  file = filels %>% filter(name=="abstracts_hand_coding") %>% pull(`id`),
  path = "data/coded_abstracts",
  type= "xlsx",
  overwrite=TRUE)

googledrive::drive_download(
  file = filels %>% filter(name=="socabs_topic_seeds") %>% pull(`id`),
  path = "data/ngram_seeds",
  type= "xlsx",
  overwrite=TRUE)

googledrive::drive_download(
  file = filels %>% filter(name=="stems_and_stops") %>% pull(`id`),
  path = "data/stems_and_stops",
  type= "xlsx",
  overwrite=TRUE)

 rm(folderid, filels)
}

Loading required namespace: googledrive

! Using an auto-discovered, cached token.

  To suppress this message, modify your code or options to clearly consent to
  the use of a cached token.

  See gargle's "Non-interactive auth" vignette for more details:

  <https://gargle.r-lib.org/articles/non-interactive-auth.html>

ℹ The googledrive package is using a cached token for 'micah.altman@gmail.com'.

Auto-refreshing stale OAuth token.

File downloaded:

• 'abstracts_hand_coding' <id: 1FO5AjIpe0iIgLgTaHBK8b7OLUR29JY9Z1aGxOWZkJ9U>

Saved locally as:

• 'data/coded_abstracts.xlsx'

File downloaded:

• 'socabs_topic_seeds' <id: 1l9I2kvMUFkVpmcMfm2pfcXCyU18fX4En5ptfu9qu8zo>

Saved locally as:

• 'data/ngram_seeds.xlsx'

File downloaded:

• 'stems_and_stops' <id: 1sp16X7rED97CXjHFloUOOg10A2mb3VaH3Dyfxjp-dv0>

Saved locally as:

• 'data/stems_and_stops.xlsx'

Code

coded_abs.df <- readxl::read_excel("data/coded_abstracts.xlsx")

Code

# retrieve and merge abstracts data
rawdir <- "data/raw"
mergedfile <- "data/diss_meta.RDS"

retrieve_raw_data <- function(dest) {
  requireNamespace("googledrive")
  folderid <- 'https://drive.google.com/drive/folders/1BFMQkwj24zwH3uTIFmrCnp32dkfY807E'
  
  filels <- googledrive::drive_ls(folderid)
 
  purrr::pwalk(filels, function(name,id,...) {
    googledrive::drive_download(
       file = id,
       path = file.path(dest,name),
      overwrite=TRUE)
    })
}

merge_raw_data<-function(dest) { 
   xls.ls <- dir(destdir,pattern="\\.xls", full.names = TRUE)
   ris.ls <- dir(destdir,pattern="\\.ris", full.names = TRUE)
   
   merged_xls.df <- purrr::map(xls.ls, readxl::read_excel) %>% 
     purrr::list_rbind()
   
   
   merged_ris.df <- purrr::map(ris.ls, revtools::read_bibliography) %>% 
     purrr::list_rbind()
   
   merged.df <- merged_xls.df %>% 
     left_join( merged_ris.df %>% select(advisor=author,StoreId=accession,
                      organization=publisher, place = "pubplace"),
                by = "StoreId" )
   
   merged.df
}

if (!file.exists(mergedfile)) {
  if (length(dir(destdir, pattern = "\\.ris$"))==0) {
      retrieve_raw_data(destdir)
  }
  diss_meta_full.df <- merge_raw_data(destdir)  
  saveRDS(diss_meta_full.df, file=mergedfile)
} 

diss_meta_full.df <- readRDS(mergedfile)

Code

abs1.df <- readxl::read_excel("data/ProQuestDocuments-2024-02-01.xls")
abs2.df <- readxl::read_excel("data/ProQuestDocuments-2024-02-01.2.xls")
diss_meta.df <- bind_rows(abs1.df, abs2.df)
rm("abs1.df","abs2.df")

### text cleaning

clean_text<-function(x) {
  x %>%  
    stringr::str_replace_all("U\\.S\\.","USA") %>% 
    stringr::str_to_lower() %>% 
    stringr::str_replace_all("[-\\)\\(\\&\"/]"," ") %>%
    stringr::str_replace_all("[^a-zA-Z \\.']", "")  %>% 
    stringr::str_squish()
}


# NOTE:
# Columns in source overal with each other -- only selected columns used
#  - all duplicates: classification, subjectClassifications, classiifcationCOdes, majorClassificationsCodes
# - subjectTerms appears to be an automated coarse recoding of the classification
# - appears author assigned, duplicate columns: identifierKeywords, subjects
#   apparently post-processed to add "GenderWatch" and "y" tags 

diss_cleaned.df <- diss_meta.df %>% 
  select(isbn, Authors, classification, subjectTerms, pubdate,
         Abstract, Title, identifierKeywords) %>%
  mutate(classification = replace_na(classification,""),
         subjectTerms = replace_na(subjectTerms,""),
         identifierKeywords = replace_na(identifierKeywords,""),
         pubyear_clean = year(as_date(pubdate,format="%Y"))
         ) %>%
  rowwise() %>%
  mutate(classification_clean = 
          str_split_1(classification ,pattern = ",") %>% 
          str_squish() %>%
          str_replace("^[0-9]+ ","") %>%
          clean_text()  %>%
          unique() %>%
          list() ,
         subject_terms_clean = str_split_1(subjectTerms,pattern = ",") %>%
           clean_text() %>% 
           unique() %>%
           list(), 
         au_identifier_terms_clean =
           str_split_1(identifierKeywords,pattern = ",") %>%
           clean_text() %>% 
           unique() %>%
           list(),
        abstract_clean = 
          clean_text(Abstract),
        title_clean =
          clean_text(Title)
        
  ) %>% 
    ungroup() %>%
    select(isbn, Authors, classification_clean, subject_terms_clean, pubyear_clean,
           Abstract,Title,identifierKeywords, au_identifier_terms_clean, title_clean, abstract_clean)

diss_cleaned.df %>%
  count(pubyear_clean) %>%
  rename(year=pubyear_clean)-> dy.df
dy.ls <- dy.df %>% pull(n)
names(dy.ls) <- dy.df %>% pull(year)

rm(diss_meta.df)

Author Characteristics

Code

requireNamespace("opengender")

Loading required namespace: opengender

Code

diss_cleaned_plus_gender.df <- diss_cleaned.df %>% 
  mutate(given=  str_split_i(Authors,pattern=",",2)
         %>% str_squish() %>% 
           str_split_i(pattern="[:space:]",1)) 

diss_cleaned_plus_gender.df %<>% opengender::add_gender_predictions(dict= "wgen2")

diss_cleaned_plus_gender.df %>% 
  summarize(opengender::gender_mean(og_details,simplify_output="row"))

# A tibble: 1 × 3
  prop_F prop_M   prop_O
   <dbl>  <dbl>    <dbl>
1  0.573  0.427 8.76e-19

Text prep

Code

# subset of the snowball stopwords

minimal_stopwords <-
  c("a",  "am", "an", "and", "any", "are", "aren't", "as", "at", "be","but", "by",  "did", "didn't", 
"do", "does", "doesn't", "doing", "don't", "down", "during", 
"each", "for", "from", "further", "had", "hadn't", "has", 
"hasn't", "have", "haven't", "having", "how", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "my", "myself",  "of", "on",  "or", "other",  "so", "than", "that", "that's", "the","their", "they", "them", "then", "there", "there's", "these", "this", "how", "to", "too",  "was", "wasn't", "when", "when's", "where", "where's", "which", "while", "will", "with", "won't")

combined_tidy.df <-  diss_cleaned.df %>%
  select(isbn,abstract_clean,title_clean,au_identifier_terms_clean) %>%
  unite(col="Clean_combined", sep=" ", remove=TRUE, na.rm=TRUE,
        abstract_clean,title_clean,au_identifier_terms_clean) %>%
  unnest_tokens(ngram, "Clean_combined", token = "skip_ngrams", n = 2,k=1,
                stopwords=minimal_stopwords)  %>%
  mutate(ngram = str_squish(ngram)) %>% 
  filter(str_length(ngram)>1)

#%>%
#  filter(!str_detect(ngram,"\\.")) %>%
#  filter(ngram!="")

Code

if (FALSE) {
  combined_tidy.df %<>%
     mutate(ngram = textstem::stem_strings(ngram, language="english"))
}

stop_post.df <- readxl::read_excel("./data/stems_and_stops.xlsx",
                                       sheet = "stopwords")

stem_raw.df <- readxl::read_excel("./data/stems_and_stops.xlsx",
                                       sheet = "groupwords")

stem_post.df <- stem_raw.df %>%
  dplyr::rowwise() %>%
  mutate( root = stringr::str_split_1(`group`, pattern="; ")[[1]],
          stem = stringr::str_split(`group`, pattern="; ")) %>%
  dplyr::ungroup() %>%
  tidyr::unnest(stem) %>%
  dplyr::filter(root!=stem) %>% 
  select(!group)
  
stem_string <- function(x,delim=" ") {
  longword.df <- stringr::str_split(x,delim)[[1]] %>%
    as_tibble()
  
  longword.df %<>% left_join(stem_post.df, by=c(value="stem"))
  longword.df %<>% mutate(result=coalesce(root,value))
  longword.df %>% pull(result) %>% paste(collapse=delim)
}

stem_reg<-function(x, pattern, delim="; ") {
  pattern_vec=str_split(pattern,delim)[[1]]
  replace_target = paste0(" ",pattern_vec[1]," ")
  replace_pattern = paste0("((^| )",pattern_vec[2:length(pattern_vec)],"($| ))",collapse="|")
  str_replace_all(x,pattern=replace_pattern,replacement=replace_target) %>%
  str_squish()
}

# 10000x faster than join version
stem_string_reg <- function(x) {
  
  reglist <- stem_raw.df %>% pull(group)
  cum <-x
  for (g in reglist) {
     cum <-  stem_reg(cum,g)
  }
  cum
}

#stem_string_reg(c("racial racialize race baiting","foo race", "bar gendered"))

stem_string_V<- Vectorize(stem_string)

combined_tidy_stem.df <- combined_tidy.df %>%
  mutate(ngram=stem_string_reg(ngram)) %>%
  ungroup()

Code

combined_tf_idf_stemmed.df <- 
  combined_tidy.df %>% 
  count(ngram,isbn) %>%
    bind_tf_idf(ngram, isbn, n)

combined_tf_idf_stemmed.df  %<>%
  anti_join(stop_post.df, by="ngram")

Distributions of topics (unclustered)

Dissertations by Controlled Classifications

Code

diss_cleaned.df %>% 
  rename(term=classification_clean) %>%
  select(isbn, term, pubyear_clean) %>% 
  unnest(cols=c(term)) -> diss_class_tidy.df

ndis <- sum(dy.ls)
lower_q <- .05
upper_q <- .8

{diss_class_tidy.df %>% 
  count(term, sort=TRUE) %>%
  mutate(p = n/sum(dy.ls)) %>%
  filter(p>= lower_q ,
         p <= upper_q ) %>%
  ggplot(aes(x=fct_reorder(term,p),y=p))+
  geom_col() +
  coord_flip() +
  labs(x="%age of dissertation assigned to controlled classifications")} %>% plotly::ggplotly()

Code

{diss_class_tidy.df %>% 
  filter(pubyear_clean < 2024) %>%
  count(term, pubyear_clean, sort=TRUE) %>%
  rowwise() %>%
  mutate(p=n/ndis,
         p_year=n/dy.ls[[as.character(pubyear_clean)]]) %>%
  filter(p_year >=  lower_q ,
         p_year <=  upper_q ) %>%
  ggplot(aes(x=fct_reorder(term,p),y=p_year))+
  geom_col() +
  coord_flip() +
  labs(x="%age dissertations in assigned  classification over time") +
  facet_wrap(vars(pubyear_clean))} %>% plotly::ggplotly()

Code

rm(diss_class_tidy.df, lower_q, upper_q)

Dissertations by Author-Assigned Topics

Code

stop_topics.df <- tibble(term=c("y","genderwatch"))

diss_cleaned.df %>% 
  rename(term=au_identifier_terms_clean) %>%
  select(isbn, term, pubyear_clean) %>% 
  unnest(cols=c(term))  %>%
  anti_join(stop_topics.df,by="term") %>%
  distinct() -> diss_au_id_tidy.df

lower_q <- .025
upper_q <- 1

{diss_au_id_tidy.df %>% 
  count(term, sort=TRUE) %>%
  mutate(p = n/sum(dy.ls)) %>%
  filter(p>= lower_q ,
         p <= upper_q ) %>%
  ggplot(aes(x=fct_reorder(term,p),y=p))+
  geom_col() +
  coord_flip() +
  labs(x="%age of dissertation by author-assigned keywords")} %>% plotly::ggplotly()

Code

rm(diss_au_id_tidy.df, stop_topics.df)

Dissertations by Terminology

Code

unc_doc_freq.df <-
  combined_tf_idf_stemmed.df %>% 
  count(ngram) %>%
  mutate(p=n/sum(dy.ls))

lower_q <- .25 
upper_q <- .75

unc_doc_freq.df  %>% 
      slice_max(order_by=n, n=200) %>%
      rename(freq=n,word=ngram) %>%
      ggwordcloud::ggwordcloud2(size=.8) +
      labs(title="terms appearing in most dissertations - excluding stopwords")

Warning in wordcloud_boxes(data_points = points_valid_first, boxes = boxes, :
Some words could not fit on page. They have been removed.

Code

{unc_doc_freq.df %>% 
  mutate(p = n/sum(dy.ls),
         ngram=fct_reorder(ngram,p)) %>%
  filter(p>= lower_q,
         p<= upper_q) %>%
  ggplot(aes(x=ngram,y=p))+
  geom_col() +
  coord_flip() +
  labs(x="terms appearing in [25%-75%] of dissertations (excluding stopwords, min 1%)")} %>% plotly::ggplotly()

Code

unc_doc_freq.df  %>% 
    slice_max(order_by=n, n=1000) %>%
    DT::datatable (
      data = .,
      extensions = 'Buttons',
      options = list(dom = 'Bfrltip',
                     buttons = c('csv')),
      caption ="ngrams appearing in most dissertations"
    )

Code

rm(unc_doc_freq.df, lower_q, upper_q)

Distribution of Terminology Across Corpus

Code

unc_ngram_freq.df <- 
  combined_tf_idf_stemmed.df %>% 
  group_by(ngram) %>%
  summarise(n=sum(n), .groups="drop") %>%
  arrange(n)

unc_ngram_freq.df  %>% 
      slice_max(order_by = n,n=200) %>%
      rename(freq=n,word=ngram) %>%
      ggwordcloud::ggwordcloud2(size=.8) +
      labs(title="most popular uncontrolled terms - excluding stop words")

Code

unc_ngram_freq.df  %>% 
    slice_max(order_by = n,n=1000) %>%
    DT::datatable (
      data = .,
      extensions = 'Buttons',
      options = list(dom = 'Bfrltip',
                     buttons = c('csv')),
      caption ="Most frequent 1 and 2 word terms in uncontrolled description"
    )

Code

rm(unc_ngram_freq.df)

Most Distinctive Terms in Each Dissertation

Code

nterms <- 5
ndiss <- 100

topterms.df <-
  combined_tf_idf_stemmed.df %>% 
  group_by(isbn) %>%
  slice_max(order_by=tf_idf, n=nterms) 

topdis.df <-
  topterms.df %>%
  group_by(isbn) %>%
  summarize(mean_tf_idf = mean(tf_idf)) %>%
  ungroup() %>%
  slice_max(order_by=mean_tf_idf, n= ndiss)
  
distinctive_diss.df <-
  left_join(topdis.df %>% select(isbn),
            topterms.df %>% select(isbn, ngram),
            by = "isbn") %>% 
  group_by(isbn) %>%
  summarize(distinct_terms = paste(ngram, sep =" ", collapse=", ")) %>% 
  ungroup() %>%
  left_join(diss_cleaned.df %>% select(Title,isbn), by="isbn")

distinctive_diss.df %>%
  relocate(Title, distinct_terms) %>%
  select(-isbn) %>%
  DT::datatable (
      data = .,
      extensions = 'Buttons',
      options = list(dom = 'Bfrltip',
                     buttons = c('csv')),
      caption ="Most distinctive terms in dissertations with distinctive terms"
    )

Code

rm(nterms,ndiss, topterms.df, topdis.df, distinctive_diss.df)

Topic Coding

Hand Coded Abstracts

Code

coded_isbns.df <- NULL

Code

hand_codes_tmp.df <-
  coded_abs.df %>% select(isbn,contains(':'),EpiStyle_type) %>% 
  filter( if_any(everything(), ~ !is.na(.x)) )

hand_codes_tmp.df %<>% 
  mutate(EpiStyle_type = str_to_lower(EpiStyle_type)) %>%
  #select(isbn,EpiStyle_type) %>%
  pivot_wider(names_from=EpiStyle_type, values_from=EpiStyle_type,
              names_prefix="epi:") %>% 
  select(-"epi:NA") %>%
  mutate(across(!isbn, 
                  ~ case_when(is.na(.x) ~ FALSE,  .default = TRUE)))

hand_codes_tmp.df %<>%
  pivot_longer(!isbn) %>%
  rename(topic=name) %>%
  mutate(coding_src="hand")

coded_isbns.df %<>% 
  bind_rows(hand_codes_tmp.df)
 
hand_code_tot.df <-
   hand_codes_tmp.df %>% 
    select(-isbn,coding_src) %>%
    filter(value) %>%
    count(topic,value) %>%
    select(-value) %>%
    separate_wider_delim(topic,
                         names=c("dimension","category"),delim=":")

{hand_code_tot.df %>%
    mutate(dimension = as.factor(dimension),
           category=as.factor(category)) %>%
    ggplot(aes(y=n,x=dimension,  fill=category,label=category)) +
    geom_col(position=position_dodge2()) +
    geom_text(position=position_dodge2(width=1), vjust=1) +
    theme(legend.position="none") + 
    facet_wrap(vars(dimension), scales="free_x")} %>%
  plotly::ggplotly()

Code

rm(hand_codes_tmp.df, hand_code_tot.df)

Exact Match to Coded Topics Terms

Code

seeds_from_codes.df <- coded_abs.df %>% 
  select(contains(':')) %>% 
  pivot_longer(cols=everything(), names_to="topic", values_to = "ngram") %>% 
  filter(!is.na(ngram) & ngram != 'x') %>%
  separate_longer_delim(ngram,';') %>%
  mutate(ngram=str_squish(ngram)) %>%
  filter(ngram!='')

seeds_from_ngrams.df <- 
  suppressMessages( readxl::read_excel("data/ngram_seeds.xlsx") )%>%
  rename(topic=Topic,ngram=Ngram) %>% select(topic,ngram)

seeds_from_ngrams.df %<>% 
  separate_longer_delim(topic,";") %>% 
  filter(!is.na(`topic`) & str_detect(`topic`,":")) %>%
  mutate(topic = str_squish(`topic`))  %>%
  filter(topic!="")

seeds.df <- bind_rows(seeds_from_codes.df, seeds_from_ngrams.df) %>% 
  mutate(ngram = str_to_lower(ngram) %>% 
           str_replace_all(pattern="-"," ") %>%
           str_squish()
           ) %>%
  distinct()

rm(seeds_from_codes.df, seeds_from_ngrams.df)

Code

exact_matches.df <- 
  left_join(seeds.df,
            combined_tf_idf_stemmed.df %>% distinct(),
            relationship="many-to-many",
            by = "ngram"
            )

exact_matches.df %>% 
  select(topic,ngram,isbn) %>%
  distinct() %>%
  group_by(topic,ngram) %>%
  summarize(n=n(), per_dis=n()/sum(dy.ls), .groups="drop") %>%
  arrange(desc(per_dis)) %>% 
  slice_head(n=15) %>%
  gt::gt() %>%
  gt::tab_header(title="most frequently matched topic terms")

most frequently matched topic terms
topic	ngram	n	per_dis
meth:empirical	interviews	686	0.3664530
meth:qual	interviews	686	0.3664530
claim:desc	examine	542	0.2895299
claim:desc	examines	471	0.2516026
subj:health	health	455	0.2430556
subj:gender	gender	438	0.2339744
scope:us	united states	435	0.2323718
subj:econ	economic	405	0.2163462
subj:race	race	404	0.2158120
meth:theory	theory	377	0.2013889
subj:race	racial	345	0.1842949
meth:qual	qualitative	343	0.1832265
subj:education	education	328	0.1752137
subj:inequality	inequality	321	0.1714744
claim:desc	explore	318	0.1698718

Code

nmatched_dis <- exact_matches.df %>% pull(isbn) %>% unique() %>% length

exact_matches.df %<>% 
  select(topic,isbn) %>% 
  distinct() %>%
  mutate(value=TRUE, coding_src="exact")

exact_matches.df %>%
  count(topic) %>%
  ungroup() %>%
  separate_wider_delim(topic,names=c("dimension","category"),delim=":") -> topic_sum.df 

topic_sum.df %<>% 
  group_by(dimension) %>%
  mutate(p=n/sum(dy.ls)) %>%
  ungroup()

lower_q <- .15 
coded_isbns.df %<>% 
  bind_rows(exact_matches.df)

{topic_sum.df%>%
  ggplot(aes(y=p,x=dimension,fill=category,label=category)) +
  geom_col(position=position_dodge2(width=1)) +
  geom_text(position=position_dodge2(width=1), vjust=1) +
  theme(legend.position="none") + 
  facet_wrap(vars(dimension), scales="free_x") +
  labs(x="percent of dissertations that contain topic-seed terms")
  } %>% plotly::ggplotly()

Code

{topic_sum.df%>%
  filter(p>lower_q) %>%
  ggplot(aes(y=p,x=dimension,fill=category,label=category)) +
  geom_col(position=position_dodge2(width=1)) +
  geom_text(position=position_dodge2(width=1), vjust=1) +
  theme(legend.position="none") + 
  facet_wrap(vars(dimension), scales="free_x") +
  labs(x="percent of dissertations that contain topic-seed terms, excluding rare topics")
  } %>% plotly::ggplotly()

Code

topic_sum.df %>% 
  group_by(dimension) %>%
  gt::gt() %>%
  gt::fmt_percent(columns="p")

category	n	p
claim
desc	1538	82.16%
mod	778	41.56%
strong	321	17.15%
weak	292	15.60%
epi
activist	108	5.77%
constructive	31	1.66%
pos	30	1.60%
util	92	4.91%
meth
empirical	1599	85.42%
longitudinal	218	11.65%
qual	1104	58.97%
quant	845	45.14%
theory	668	35.68%
scope
us	1069	57.10%
world	373	19.93%
subj
class	450	24.04%
crim	418	22.33%
econ	868	46.37%
education	558	29.81%
environment	317	16.93%
ethnicity	290	15.49%
family	604	32.26%
gender	561	29.97%
health	646	34.51%
identity	388	20.73%
immigration	271	14.48%
inequality	1080	57.69%
lifecourse	202	10.79%
media	238	12.71%
movements	263	14.05%
networks	257	13.73%
orgs	557	29.75%
politics	607	32.43%
race	700	37.39%
religion	118	6.30%
rural	100	5.34%
sexuality	185	9.88%
stigma	90	4.81%
urban	280	14.96%
violence	202	10.79%
youth	178	9.51%

Code

rm(lower_q,exact_matches.df)

Data Source Matches

Determine geographical scope by implication from referenced data source.

Code

dataset_codes.df <- readxl::read_excel("./data/stems_and_stops.xlsx",
                                       sheet = "datasets") %>%
      mutate(phrase_clean=clean_text(phrase))

dataset_codes_ls.df <-
  dataset_codes.df %>%
  group_by(topic) %>%
  summarise(phrase_list=list(phrase_clean), .groups="drop") %>%
  rowwise() %>% 
 # mutate(reg=paste0("((^| )",phrase_list,"( |$))",collapse="|")) %>% 
 # much slower -- in this case, we can assume that dataset names don't come
 # at very beginning of end
  mutate(reg=paste0("( ",phrase_list," )",collapse="|")) %>% 
  ungroup()

data_matches.df<- purrr::pmap(dataset_codes_ls.df,
    function(topic,reg,...) {
              diss_cleaned_plus_gender.df %>% 
                filter(stringr::str_detect(abstract_clean, reg)) %>% 
                select(isbn) %>% 
                mutate(topic=topic)
    } 
) %>% purrr::list_rbind() %>%
  distinct() %>% 
  mutate(coding_src="data")

coded_isbns.df %<>% 
  bind_rows(data_matches.df)

data_matches.df %>% 
  count(topic) %>%
  gt::gt()

topic	n
scope:us	156
scope:world	7

Topic Models

Code

# tidy helper for keyATM
tidy.keyATM_output<-function(x, matrix="phi", long=FALSE) {
  if (matrix=="phi") {
    res  <- x[["phi"]] %>%
        tibble::as_tibble(rownames=NA) %>%
        tibble::rownames_to_column("topic") 
    if(long) {
      res <-  res %>% 
        tidyr::pivot_longer(!topic) %>%
        dplyr::mutate(term=paste("phi",topic,name,sep="+"),param="phi" ) %>%
        dplyr::relocate(term,estimate=value,topic,word=name)
    }
  } else {
    res <- lda_key[["theta"]]  %>%
        tibble::as_tibble(rownames=NA) %>%
        tibble::rownames_to_column("document")
        if(long) {
    res <-  res %>% tidyr::pivot_longer(!document) %>%
        dplyr::mutate(term=paste("theta",document,name,sep="+"),param="theta" ) %>%
        dplyr::relocate(term,estimate=value,document,topic=name)
    }
  }
  res
}

Code

requireNamespace("quanteda")

Loading required namespace: quanteda

Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"ndiMatrix" of class "replValueSp"; definition not updated

Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"pcorMatrix" of class "replValueSp"; definition not updated

Code

requireNamespace("keyATM")

Loading required namespace: keyATM

Code

topic_lower_q <- .15
unseeded_topics <- 3
seed_prior_strength <- .95
n_subj_categories <- 12
lower_q <-  topic_lower_q
upper_q <- .9
exclude_dim <- c("claim")

#stemming and grouping

combined_tf_idf_stemmed.df %>% 
  group_by(ngram) %>%
  summarize(n=n()) %>%
  mutate(p=n/sum(dy.ls)) -> ngram_diss_ct.df

core_ngrams.df  <-
  ngram_diss_ct.df %>%
  filter( p >= lower_q, p<= upper_q) %>%
  select("ngram") %>%
  bind_rows(seeds.df %>% select(ngram))  %>%
  distinct() %>%
  left_join(combined_tf_idf_stemmed.df %>% select(ngram,isbn,n)) %>%
  na.omit() %>%
  distinct()

Joining with `by = join_by(ngram)`

Code

core_dfm <- 
  core_ngrams.df %>%
  rename(term=ngram, document=isbn, value=n) %>% 
  cast_dfm(document=document,term=term,value=value)

keyATM_docs <- keyATM::keyATM_read(texts = core_dfm, keep_docnames = TRUE)

ℹ Using quanteda dfm.

Code

index2isbn.df <- tibble(index=keyATM_docs[["doc_index"]], isbn=keyATM_docs[["docnames"]])

index2isbn.df %<>% left_join(diss_cleaned_plus_gender.df %>% select(isbn,Title), by="isbn")

excluded_topics<- topic_sum.df %>%
  filter(p<topic_lower_q) %>% 
  select(dimension,category) %>% 
  transmute(topic=paste(dimension,category,sep=":"))

excluded_areas <- topic_sum.df %>%
    filter(dimension %in% exclude_dim) %>%
      transmute(topic=paste(dimension,category,sep=":"))

excluded_subjects <- topic_sum.df %>%
 filter(dimension=="subj") %>% 
 arrange(desc(p)) %>% 
 slice_tail(n=-1*n_subj_categories ) %>%
 select(dimension,category) %>% 
 transmute(topic=paste(dimension,category,sep=":"))

seeded_topics.ls <- seeds.df %>% 
  anti_join(bind_rows(excluded_topics,excluded_subjects, excluded_areas), by ="topic") %>%
  group_by(`topic`) %>% 
  summarise(ngram_list=list(ngram)) %>% 
  pmap( function(topic,ngram_list) { x<-list(); x[[topic]] <- ngram_list; x} ) %>% 
  list_flatten() 

key_viz <- keyATM::visualize_keywords(docs = keyATM_docs, 
                                      keywords = seeded_topics.ls)

Warning: Keywords are pruned because they do not appear in the documents: in depth
interviews, semi structured interviews, american time use survey, u.s., centers
for disease control, american community survey, department of veterans affairs,
american time use survey, in asian countries, united states and europe, in
france, low income countries, middle income countries, high income countries,
world bank lending, with criminal, their school, their children, …, these
organizations, and their racial

Code

key_viz

Warning: Removed 348 rows containing missing values or values outside the scale range
(`geom_line()`).

Warning: Removed 348 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 348 rows containing missing values or values outside the scale range
(`geom_label_repel()`).

Warning: ggrepel: 628 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Code

suppressMessages(
  lda_key <- keyATM::keyATM(
  docs              = keyATM_docs,    # text input
  no_keyword_topics = unseeded_topics,              # number of topics without keywords
  keywords          = seeded_topics.ls,       # keywords
  priors            = list(beta_s=seed_prior_strength),
  model             = "base",         # select the model
))

Warning: Keywords are pruned because they do not appear in the documents: in depth
interviews, semi structured interviews, american time use survey, u.s., centers
for disease control, american community survey, department of veterans affairs,
american time use survey, in asian countries, united states and europe, in
france, low income countries, middle income countries, high income countries,
world bank lending, with criminal, their school, their children, …, these
organizations, and their racial

Code

lda_key %>% keyATM::plot_modelfit()

Code

keyATM::semantic_coherence(lda_key, keyATM_docs, n = 10)  -> lda_key.sem
lda_key.sem %>% as_tibble(rownames="topic") %>% rename(coherence=value) %>% gt::gt()

topic	coherence
1_meth:empirical	-55.30294
2_meth:qual	-54.87866
3_meth:quant	-49.68228
4_meth:theory	-71.94672
5_scope:us	-52.64626
6_scope:world	-80.31968
7_subj:class	-70.60734
8_subj:crim	-68.31088
9_subj:econ	-52.12322
10_subj:education	-43.85082
11_subj:family	-66.52815
12_subj:gender	-56.89063
13_subj:health	-62.55667
14_subj:identity	-90.62521
15_subj:inequality	-88.46208
16_subj:orgs	-73.20833
17_subj:politics	-64.45310
18_subj:race	-49.35721
Other_1	-111.07606
Other_2	-107.55990
Other_3	-78.48870

Code

lda_key %>% keyATM::top_words() %>%
  gt::gt() %>% 
  gt::tab_header("Top words for each topic")

Top words for each topic
1_meth:empirical	2_meth:qual	3_meth:quant	4_meth:theory	5_scope:us	6_scope:world	7_subj:class	8_subj:crim	9_subj:econ	10_subj:education	11_subj:family	12_subj:gender	13_subj:health	14_subj:identity	15_subj:inequality	16_subj:orgs	17_subj:politics	18_subj:race	Other_1	Other_2	Other_3
data	interviews [✓]	data	network	states	environmental	class [✓]	police [✓]	economic [✓]	students [✓]	family [✓]	gender [✓]	health [✓]	identity [✓]	inequality [✓]	organizations [✓]	political	black [✓]	neighborhood	migration	immigrant
empirical [✓]	community	results	networks	united	world [✓]	workers [✓]	criminal [✓]	labor [✓]	school [✓]	children [✓]	women	mental [✓]	religious	urban	organizational [✓]	state	racial [✓]	media	migrants	immigrants
survey	cultural	factors	theory [✓]	united states [✓]	development	work	law [✓]	work	education [✓]	families [✓]	sexual	mental health [✓]	sexual	disparities [✓]	management [✓]	policy	race [✓]	social media	migrant	immigration
effects	practices	effects	status	state	national	middle class [✓]	justice [17]	workers [7]	college [✓]	parents [✓]	violence	medical [✓]	queer	discrimination [✓]	practices	movements	white [✓]	neighborhoods	latinx	status
examine	experiences	associated	support	policy	sustainability	labor [9]	incarceration [✓]	economy [✓]	educational [✓]	life	gendered [✓]	life	lgbtq	rural	work	politics [✓]	ethnic	change	violence	usa [5]
relationship	people	higher	social networks	american [✓]	international [✓]	worker [✓]	legal [✓]	employment [✓]	schools [✓]	adulthood [✓]	experiences	healthcare [✓]	religion	inequalities [✓]	public	public	racism [✓]	climate change	refugees	legal [8]
results	work	rates [✓]	relationships	new	nations [5]	race [18]	policing [✓]	job [✓]	higher	marriage [✓]	feminist [✓]	covid [✓]	stigma [15]	barriers [✓]	organization [✓]	justice [✓]	indigenous [✓]	collective	community	second generation
factors	new	support	online	public	transnational [✓]	working class [✓]	prison [✓]	labor market [✓]	student [✓]	relationships	transgender [✓]	pandemic [✓]	people	equity [✓]	activists	environmental	ethnicity	individuals	youth	migration
new	process	outcomes	role	local [✓]	cultural	women	punishment [✓]	occupational [✓]	high	childhood [✓]	men women [✓]	older adults	sexuality	gentrification [✓]	change	welfare [✓]	racialized [✓]	online	rural	united states [5]
outcomes	depth	survey	social network	political	human rights [17]	social class [✓]	violence	markets [✓]	higher education [✓]	work	gender inequality [✓]	patients [✓]	experiences	intersectional [✓]	movements	argue	black women [✓]	political	usa [5]	states

Code

lda_key %>% keyATM::top_docs(n=5)%>% 
  pivot_longer(cols=everything()) %>%
  rowwise() %>%
  left_join(index2isbn.df, by=c(value="index")) %>%
  ungroup() %>% 
  select(-value,isbn) %>%
  group_by(name) %>%
  gt::gt() %>%
  gt::tab_header("Top dissertations in each topic")

Top dissertations in each topic
isbn	Title
1_meth:empirical
9798380595100	Computational Social Science for Sociological Research
9798672178813	Rednecks and Hillbillies: A Thematic Analysis of the Construction of Pride and High Self-Esteem Exhibited by Southern Characters
9798380836685	Creativity on Demand: Cognition, Materiality, and Sociality in Improvisation
9798534691252	Roaring Tiger or Bounce Tigger? Examining Corruption Seriousness Perception in China from a Socio-Cultural Perspective
9798379570682	Three Papers on the Moral Perceptions of Scientific Misconduct
2_meth:qual
9798834058755	The Ghost in the Machine: Organ Transplantation and the Phenomenology of Embodiment
9798380135238	Narrative Power: Social Control in ISLs & Group Homes
9798381413083	Sociological Silhouettes: The Archaeology and Demography of Knowledge
9798357574244	Within the Waterline: An Exploratory Qualitative Study of How Social Interactions between Leadership Coaches and Their Young Professional Clients Facilitates Leader-Identity Transformation
9798538105540	Excavating Palimpsests in Ponte City
3_meth:quant
9798762198066	Hurricanes and Heart Problems: Natural Disasters, Social Capital, and Cardiovascular Mortality
9798819384725	The Influence of Demographic, Situational, Psychological, and Motivational Factors in Mass Murder Attacks
9798569993567	Recidivism and Crime Specialization Among Male Inmates in Chinese Prisons
9798672157955	Inter Vivos Transfers in Twenty European Countries (2004-2017)
9798802704905	Shareholder Value Minimization?: How Some US Corporations Avoid Institutional Pressures
4_meth:theory
9798845428967	Witchcraft and Partisanship in Contemporary Society: The Case of QAnon
9798569924295	Testing the Intrinsic Benefit Model of the Signaling Theory
9798379718763	Group Cognition: Reclaiming C.S. Peirce’s Specialized Theory of Cognition and Its Implications for Sociology and the Cognitive Sciences
9798841728948	Peircean Pragmatism, Critical Realism, and Cultural Sociology
9798641784465	A Social Network Analysis of Online Gamers' Friendship Networks: Structural Attributes of Steam Friendships, and Comparison of Offline-Online Social Ties of MMO Gamers
5_scope:us
9798357537522	Permanent Austerity: A 40 Year Case Study of Chicago Public Finance
9798834026358	National Carbon Prices
9798351421940	Punctuated Policy Innovation: The Transformation of U.S. Gendered Pay Inequity Policy Alternatives, 1945-2019
9798460416622	The Caduceus of Modernity: States, Pirates, and the Battle for Sovereignty
9798380620642	Marketing the Missing: Missing Persons and the Competition for Concern
6_scope:world
9798516927935	The Age of Protest: World-Historical Structure and Dynamics of Protest Waves in the Global South in the Long Twentieth Century
9798380590273	Individual Empowerment in Multiple Modernities: A Quantitative Exploration of the Relationship Between Economic Development, Culture, and Civil Liberties
9798841749431	Reproduction of Nationhood: Citizenship and Immigration Policy in Turkey
9798351436784	Agent Orange and the Treadmill of Destruction: Environmental Degradation and Inequality in Vietnam
9798379914578	The Global Path of Human Rights: A Quantitative Analysis of Trajectories, Democracy, and Development
7_subj:class
9798780656401	Solidarity Infrastructure: Gender and Race Solidarity and Cross-Class Coalitions in the Kansas City General Strike of 1918
9798380852579	Dallas: Kinship, Mobility, and Inheritance in an Elite Population, 1895-1945
9798380858113	<strong>WORK, CLASS AMBIGUITY, AND MULTIPLE FEMININITIES</strong> Women Beauty and Retail Workers in Pakistan’s New Service Economy
9798790667619	Constructing the Perfect Girlfriend: Gender, Class, Race, & Performativity of Paid Intimacy in Nevada Brothels
9798460487561	State, Street, Store: The Development of the Chinese Middle Class
8_subj:crim
9798460434251	The Ideology of the Carceral State: Examining the Prison through Film
9798834020844	Attitudes of Guangzhou Public toward Criminal Retrial: Predictors and Process from the Relationism Theory
9798358480247	Can Electronic Monitoring Fix Mass Incarceration?: Understanding the Role of Electronic Monitoring in Local Policy Reform Efforts
9781658492614	Analysis of the YLS/CMI Risk Assessment and Recidivism Prediction for Justice-Involved Youth
9798538120000	Breaking Out of Prison Culture? Or Culture Breaking Out of Prison?: Understanding Convict Code Endorsement during Incarceration and Post-Release
9_subj:econ
9798678131263	Bearing a Beloved Burden: Surrogates, Reproductive Labor, and Carrying Babies for Others
9798557097413	Bad Jobs or ‘Badass’ Jobs? Fishing Guides, Self-Employment, and Service Work in the New Economy
9798379575199	Changes in Occupational Structure and Labor Market Outcomes in China
9798492722500	Taylored Flexibility: Agile, Control, and the Software Labor Process
9798597058108	Government Regulation and Economic Globalization: The Diffusion of Antitrust Law and Merger Control and Its Influence on Cross-Border Mergers and Acquisitions
10_subj:education
9798834020516	Risk and Protective Factors in Adolescent Substance Use: Comparing Students of Regular High Schools and Vocational High Schools in China
9798535591438	Educational Trajectories of Indigenous Students: Vertical and Horizontal Stratification in the Chilean Educational System
9798380599337	Transition in Times of Transition: The Last Year of High School and the College-Going (Or Not-Going) Decision-Making Process of Senior High School Students in Chile
9798380595209	STEM Participation, Persistence, and Attrition Among First-Generation College Students
9798841795469	Encounters with Uncertainty: How Secondary Education Policies Shape Parental Involvement in Schooling in Contemporary Turkey
11_subj:family
9798357550293	The Consequences of Family Stress in the Work-Family Interface: Three Longitudinal Studies in Canada
9798802748862	Constructing Childhood through Social Interaction: Rights, Obligations, and Accountability in Adult-Child Interaction
9798379422578	Intergenerationality: The Making and Consequences of Parental Intervention in Children’s Marriage in China
9798678157829	Dissolution Pathways: Mother-Child Relationship Quality, Adolescent Academic Well-Being, and College Completion Among Young Adults
9798380176071	Three Essays on Responsibility and the Transition to Adulthood in Comparative Perspective
12_subj:gender
9798438745518	The Rape Paradox: The Effect of Anti-Sexual Assault Policies on Gender and Sexual Assault Attitudes, Donations to Sexual Assault Campaigns, and Policy Compliance
9798841754534	Gender Knowledge: Category, Status, Transgression, Policing, and Perception
9798845426468	Sexual Interaction in Action: Consent and Agency in the Sexual Communications of Early Adolescents and Adults
9798379586041	“Skanks Need to Pay with their Lives”: Online Harassment, Cyberstalking, and Violence Against Women Online
9798662468405	Reproducing Gender Inequality in Gaming: A Dual Analysis of the Gaming Hierarchy and Gamer Identity
13_subj:health
9798379620769	Surveillance Medicine in Perinatal Care: Negotiating Constraints, Constructing Risk, and the Elusive Goal of Mental Health Integration
9798379946050	Exploring Multiple Stakeholder Experiences of Healthcare: Opportunities for Intervention and Change to Improve Care and Outcomes for People with Mental Illness
9798819388891	Exploring Differential Exposure to Adverse Social Determinants of Health for Children with Developmental Disabilities and Their Related Health Outcomes and Resiliency
9798790654770	Educational Disparities in Chronic Pain and Life Expectancy: Gaps and Pathways
9798460476299	Multidimensional Health: Applications Across Cultures, Cohorts, and the COVID-19 Pandemic
14_subj:identity
9798607337148	Speaking Through the Silence: Narratives, Interaction, and the Construction of Sexual Selves
9798358499164	Institutionalized Normative Heterosexuality: The Case of Sexual Fluidity
9798664798906	Out in Front: How LGBQ Women and Genderqueer Clergy Navigate Religious Communities
9798582505327	Behind the Laughter: Health, Stigma and Communal Coping Among Standup Comedians
9798780637288	LGBTQ YouTube: Community and Branding through New Media
15_subj:inequality
9798534659009	Income Inequality and Caste in India: Evidence from India Human Development Surveys
9798662382343	The Social Etiology of Prescription Psychotherapeutic Misuse among Rural and Urban Adolescents and Emerging Adults: Concurrent and Prospective Stress-Process Analyses
9798374405644	The Rent Is Too Damn High: The Spatial and Longitudinal Dimensions of Housing Affordability
9798738630620	The Measurement and Social Consequences of U.S. Income Inequality
9798759981954	Urban-Rural Differences in Lifespan Variation in the United States
16_subj:orgs
9798678120151	Laissez Fairy Tales: Consensus, Cohesion, and Corporate Culture During the Collapse of Lehman Brothers
9798534666571	From Angry Militants to Happy Vegans: Animal Rights Activists' Recasting for Vegan Recruitment
9781658487306	Reinventing Diversity, Activists Interfering with the Managerialization of the Law
9798460471942	Fashioning Religious Selves in a Secular Age
9798834058038	Welcome Back a Yard: Jamaican Imaginings of the Diasporan Homebuyer
17_subj:politics
9798380091572	Erasing Violence: Political Contests over the Recognition of the Great Ukrainian Famine of 1932-1933
9798381107340	Unsettling Science: How Activists Deployed Science in the Conversion Therapy Debate
9798662595316	Militarism, Democracy, and Concordance: The Role of Citizenry in (Re)-Establishing Democratic Order in Argentina and Turkey
9798351456461	the Fraternal Twins: A Comparative Study of Hegemony Building, Political Mobilization and Crony Capitalism in Neo-Authoritarian Turkey and Russia (1991–2021)
9798841795353	Mobilizing Consumers: The American Consumer Movement in the 1960s–70s as a Social Movement
18_subj:race
9798352940266	Understanding “Our” Similarities and Differences Academically, Socially, and Psychologically: The Race-Gendered Experiences of Black Men and White Men Enrolled in STEM Doctoral Programs
9798662409545	The Macro and Micro Foundations of Racial Residential Segregation: A Contemporary Analysis
9798672129129	Social Stratification in the Mortgage Market Post the Great Recession
9798845451798	Internalizing Achievement Inequality: The Development of Racial/Ethnic Differences in Mathematics Attitudes and Their Implications for Persistence in STEM
9798351468013	Backdoor to Essentialism? Genetic Ancestry Testing and the Social Deconstruction of Whiteness
Other_1
9781392539118	Three Papers on Collective Action
9798780611493	Understanding Climate Change Discourse in China
9798480642933	How Social Media Affects Political Action: The Effects of Digital Network Structures and Motivations on Movement Participation
9798460436040	“Acts of Pure Evil”: The Portrayal of Mass Shooting Events on Online Media Platforms
9798380858816	Autism in Perpetual Motion: A Methodology for Examining Social Media Data via the Case of Autistic Influencers of Twitter, 2018-2019
Other_2
9798438794738	Knowledge, Place, and Experience in the Migrant Journey: How Central American Migrant Youth Negotiate Violence in Mexico
9798845452276	The Role of Community Context Factors in explaining International Migrant Flows and their Composition: Three Studies based on the Mexico-U.S. Case
9798380897396	The Emergence of Ina-Ethe Migration : Mpondo Men and Continued Migrant Labour Post-Apartheid
9798837553493	Intersubjective Understanding of Violence: The Lifeworld of Tajik Immigrant Workers in Post-Soviet Russia
9798841770886	Spatial Disparity and Educational Sorting of U.S. Internal Migration
Other_3
9798535509976	Legal Status Fluidity: Theorizing Legal Status Transitions and How Filipino Immigrants Navigate Immigration Pathways
9798351434360	Bureaucracies of Removal: The Labor and Logics of US Immigration Courts
9798837524691	Social and Economic Consequences of “Get Tough” on Immigration
9798538103768	Measuring the Effect of Immigrant Legal Status on Socioeconomic Outcomes: Variation by Legal Status Assignment Approach
9798381168358	Barriers to Legal Status Attainment and the Outcomes of Immigrants and Their Children

Code

{lda_key %>% keyATM::plot_topicprop(
  n = 5,
  show_topic = NULL,
  show_topwords = TRUE,
  label_topic = NULL,
  order = "proportion")}[["figure"]] %>%
  plotly::ggplotly()

Code

# documents with n% of words on specific topic

tidy(lda_key, matrix="theta", long=TRUE) %>%
  arrange(desc(estimate)) %>% 
  filter(estimate > .2)  %>%
  count(topic) %>%
  gt::gt() %>%
  gt::tab_header("number of dissertations with => 20% words from given topic")

number of dissertations with => 20% words from given topic
topic	n
10_subj:education	181
11_subj:family	144
12_subj:gender	157
13_subj:health	257
14_subj:identity	85
15_subj:inequality	118
16_subj:orgs	76
17_subj:politics	257
18_subj:race	237
1_meth:empirical	528
2_meth:qual	792
3_meth:quant	183
4_meth:theory	104
5_scope:us	145
6_scope:world	57
7_subj:class	43
8_subj:crim	141
9_subj:econ	117
Other_1	48
Other_2	43
Other_3	51

Code

lda_matches.df <- tidy(lda_key, matrix="theta", long=TRUE) %>%
  arrange(desc(estimate)) %>% 
    filter(estimate > .2) %>%
    select(isbn=document, topic=topic) %>%
    mutate(topic=str_replace(topic,"\\d+_","")) %>%
    mutate(topic=str_replace(topic,"^Other","subj:other")) %>%
    distinct() %>%
    mutate(value=TRUE, coding_src="LDA")

coded_isbns.df %<>% filter(coding_src!="LDA")
coded_isbns.df %<>% bind_rows(lda_matches.df)

Bi-variate analysis

Code

coded_isbns_wide.df <- coded_isbns.df %>%
  select(-coding_src) %>%
  filter(value) %>%
  distinct()  %>% 
  pivot_wider(values_from=value, names_from=topic, values_fill=FALSE)

coded_isbns_wide_nest.df<- coded_isbns.df %>%
  filter(value) %>%
  select(isbn,topic) %>%
  distinct() %>%
  separate_wider_delim(topic,delim=":", names=c("dimension","category")) %>%
  pivot_wider(names_from="dimension", values_from=category,
              values_fn=list) %>% 
  left_join(diss_cleaned_plus_gender.df %>% select(isbn,og_pr_F), by="isbn") 
  
coded_isbns_wide_unpacked.df <-
  coded_isbns_wide_nest.df%>% select(isbn,subj,meth,scope,og_pr_F) %>%
  unnest(cols=c(subj), keep_empty =TRUE) %>%
  unnest(cols=c(meth), keep_empty =TRUE) %>%
  unnest(cols=c(scope) , keep_empty =TRUE) 

coded_isbns_gender.df <- 
  coded_isbns.df %>%
  ungroup() %>%
  filter(value) %>%
  select(!coding_src) %>%
  distinct() %>%
  left_join(diss_cleaned_plus_gender.df %>% select(isbn, og_pr_F), by="isbn") %>%
   separate_wider_delim(topic,names=c("dimension",
                                     "category"),delim=":")

coded_isbns.df %>%
  filter(value) %>%
  select(-coding_src) %>%
  distinct() %>%
  count(topic,sort=TRUE) %>%
  ungroup() %>%
  separate_wider_delim(topic,names=c("dimension",
                                     "category"),delim=":") ->   all_topic_sum.df 
all_topic_sum.df %<>% 
  group_by(dimension) %>%
  mutate(p=n/sum(dy.ls)) %>%
  ungroup()

Code

all_topic_sum.df

# A tibble: 47 × 4
   dimension category       n     p
   <chr>     <chr>      <int> <dbl>
 1 meth      empirical   1638 0.875
 2 claim     desc        1538 0.822
 3 meth      qual        1198 0.640
 4 subj      inequality  1081 0.577
 5 scope     us          1080 0.577
 6 subj      econ         868 0.464
 7 meth      quant        862 0.460
 8 claim     mod          778 0.416
 9 meth      theory       709 0.379
10 subj      race         701 0.374
# ℹ 37 more rows

Code

{all_topic_sum.df%>%
  filter(p>lower_q) %>%
  ggplot(aes(y=p,x=dimension,fill=category,label=category)) +
  geom_col(position=position_dodge2(width=1)) +
  geom_text(position=position_dodge2(width=1), vjust=1) +
  theme(legend.position="none") + 
  facet_wrap(vars(dimension), scales="free_x") +
  labs(x="ALL CODING METHODS: percent of dissertations in each category, excluding rare")
  } %>% plotly::ggplotly()

Code

coded_isbns_wide_unpacked.df %>%
  group_by(subj,meth,scope) %>%
  summarize(.groups="drop",
       n = length(unique(isbn)),
       pr_F= mean(distinct(data.frame(isbn,og_pr_F))[["og_pr_F"]], na.rm=TRUE   )
            ) %>%
  DT::datatable (
      data = .,
      extensions = 'Buttons',
      options = list(dom = 'Bfrltip',
                     buttons = c('csv')),
      caption ="subj x meth x scope frequencies"
    )

Code

coded_isbns_wide_unpacked.df %>%
  group_by(subj) %>% 
  mutate(subj_total = length(unique(isbn)))  %>%
  group_by(subj,meth) %>%
  summarize(.groups="drop",
       n = length(unique(isbn)),
       percent_of_subject = n/unique(subj_total),
       pr_F= mean(distinct(data.frame(isbn,og_pr_F))[["og_pr_F"]], na.rm=TRUE   )
            ) -> meth_subj_freq.df 

coded_isbns_wide_unpacked.df %>%
  group_by(subj) %>% 
  mutate(subj_total = length(unique(isbn)))  %>%
  group_by(subj,scope) %>%
  summarize(.groups="drop",
       n = length(unique(isbn)),
       percent_of_subject = n/unique(subj_total),
       pr_F= mean(distinct(data.frame(isbn,og_pr_F))[["og_pr_F"]], na.rm=TRUE   )
            ) -> scope_subj_freq.df 

coded_isbns_wide_unpacked.df %>%
  group_by(subj) %>% 
  summarize(subject_total = length(unique(isbn)))  -> subject_totals.df

meth_subj_freq.df %>%
  mutate(display=glue::glue('{round(percent_of_subject,2)} ({n})')) %>%
  select(subj,meth,display) %>%
  pivot_wider(names_from=meth,values_from=display, values_fill="0") %>%
  gt::gt() %>%
  gt::tab_header("proportion of each subject employing methodology") %>%
  gt::cols_label(subj="") %>%
    gt::tab_caption("Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses. ")

Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses.
proportion of each subject employing methodology
	empirical	longitudinal	qual	quant	theoretical	theory	NA
class	0.9 (405)	0.14 (61)	0.69 (311)	0.45 (204)	0 (1)	0.35 (159)	0.03 (12)
crim	0.86 (360)	0.12 (49)	0.63 (265)	0.46 (194)	0	0.35 (148)	0.04 (15)
econ	0.89 (774)	0.13 (117)	0.64 (553)	0.48 (413)	0	0.38 (328)	0.03 (23)
education	0.88 (489)	0.14 (78)	0.6 (337)	0.55 (306)	0	0.39 (220)	0.03 (14)
environment	0.9 (285)	0.12 (37)	0.7 (221)	0.41 (130)	0	0.39 (123)	0.03 (8)
ethnicity	0.89 (259)	0.14 (40)	0.61 (178)	0.59 (171)	0	0.38 (109)	0.03 (9)
family	0.9 (546)	0.17 (103)	0.58 (349)	0.5 (304)	0	0.35 (211)	0.02 (14)
gender	0.9 (507)	0.1 (58)	0.64 (363)	0.5 (281)	0 (1)	0.39 (218)	0.03 (15)
health	0.9 (583)	0.15 (96)	0.59 (378)	0.56 (362)	0	0.39 (251)	0.02 (11)
identity	0.91 (375)	0.06 (25)	0.78 (324)	0.36 (147)	0 (1)	0.4 (167)	0.02 (9)
immigration	0.9 (245)	0.13 (36)	0.68 (185)	0.45 (122)	0	0.35 (95)	0.02 (6)
inequality	0.9 (969)	0.12 (127)	0.65 (698)	0.5 (537)	0	0.37 (397)	0.03 (28)
lifecourse	0.93 (187)	0.3 (61)	0.43 (87)	0.61 (123)	0	0.38 (76)	0.02 (4)
media	0.91 (216)	0.06 (14)	0.72 (172)	0.39 (94)	0	0.43 (102)	0.02 (5)
movements	0.86 (225)	0.07 (19)	0.79 (208)	0.34 (89)	0	0.41 (109)	0.03 (9)
networks	0.88 (226)	0.13 (34)	0.6 (154)	0.5 (129)	0	0.57 (146)	0.01 (2)
orgs	0.88 (497)	0.08 (46)	0.79 (443)	0.39 (222)	0	0.38 (213)	0.02 (10)
other_1	0.88 (42)	0.02 (1)	0.6 (29)	0.46 (22)	0	0.31 (15)	0.04 (2)
other_2	0.91 (39)	0.07 (3)	0.72 (31)	0.42 (18)	0	0.21 (9)	0.02 (1)
other_3	0.88 (45)	0.14 (7)	0.69 (35)	0.49 (25)	0	0.39 (20)	0.02 (1)
politics	0.85 (545)	0.11 (71)	0.71 (453)	0.38 (244)	0	0.37 (238)	0.04 (23)
race	0.88 (620)	0.11 (76)	0.66 (465)	0.49 (346)	0 (1)	0.38 (266)	0.03 (24)
religion	0.86 (102)	0.09 (11)	0.62 (73)	0.53 (63)	0	0.35 (41)	0.02 (2)
rural	0.88 (88)	0.12 (12)	0.6 (60)	0.48 (48)	0	0.34 (34)	0.05 (5)
sexuality	0.91 (169)	0.05 (10)	0.77 (143)	0.39 (73)	0.01 (1)	0.37 (68)	0.04 (7)
stigma	0.93 (84)	0.03 (3)	0.71 (64)	0.38 (34)	0	0.46 (41)	0.01 (1)
urban	0.87 (244)	0.14 (39)	0.64 (180)	0.5 (139)	0	0.35 (97)	0.03 (8)
violence	0.86 (174)	0.1 (20)	0.65 (132)	0.45 (90)	0	0.38 (76)	0.04 (9)
youth	0.94 (167)	0.29 (52)	0.54 (96)	0.54 (97)	0	0.35 (62)	0.01 (2)
NA	0.71 (5)	0	0.43 (3)	0.14 (1)	0	0.43 (3)	0

Code

meth_subj_freq.df %>%
  mutate(display=glue::glue('{round(pr_F,2)} ({n})')) %>%
  select(subj,meth,display) %>%
  pivot_wider(names_from=meth,values_from=display, values_fill="0") %>%
  gt::gt() %>%
  gt::tab_header("Female proportion by subject & method") %>%
  gt::cols_label(subj="") %>% 
  gt::tab_caption("Categories are overlapping. Number of dissertations in category in parentheses. ")

Categories are overlapping. Number of dissertations in category in parentheses.
Female proportion by subject & method
	empirical	longitudinal	qual	quant	theoretical	theory	NA
class	0.64 (405)	0.54 (61)	0.67 (311)	0.57 (204)	0.96 (1)	0.59 (159)	0.59 (12)
crim	0.61 (360)	0.49 (49)	0.65 (265)	0.56 (194)	0	0.59 (148)	0.47 (15)
econ	0.59 (774)	0.51 (117)	0.61 (553)	0.57 (413)	0	0.53 (328)	0.62 (23)
education	0.64 (489)	0.55 (78)	0.67 (337)	0.62 (306)	0	0.58 (220)	0.65 (14)
environment	0.55 (285)	0.57 (37)	0.55 (221)	0.53 (130)	0	0.46 (123)	0.42 (8)
ethnicity	0.6 (259)	0.69 (40)	0.62 (178)	0.59 (171)	0	0.59 (109)	0.67 (9)
family	0.67 (546)	0.58 (103)	0.7 (349)	0.62 (304)	0	0.61 (211)	0.49 (14)
gender	0.73 (507)	0.7 (58)	0.77 (363)	0.7 (281)	0.96 (1)	0.72 (218)	0.7 (15)
health	0.63 (583)	0.61 (96)	0.68 (378)	0.6 (362)	0	0.59 (251)	0.46 (11)
identity	0.62 (375)	0.62 (25)	0.63 (324)	0.6 (147)	0.96 (1)	0.61 (167)	0.77 (9)
immigration	0.61 (245)	0.61 (36)	0.64 (185)	0.56 (122)	0	0.57 (95)	0.67 (6)
inequality	0.63 (969)	0.54 (127)	0.65 (698)	0.62 (537)	0	0.59 (397)	0.64 (28)
lifecourse	0.66 (187)	0.62 (61)	0.69 (87)	0.62 (123)	0	0.56 (76)	0.5 (4)
media	0.55 (216)	0.42 (14)	0.6 (172)	0.44 (94)	0	0.49 (102)	0.71 (5)
movements	0.57 (225)	0.56 (19)	0.58 (208)	0.54 (89)	0	0.47 (109)	0.42 (9)
networks	0.54 (226)	0.53 (34)	0.55 (154)	0.51 (129)	0	0.47 (146)	0.48 (2)
orgs	0.6 (497)	0.42 (46)	0.63 (443)	0.56 (222)	0	0.52 (213)	0.47 (10)
other_1	0.48 (42)	NaN (1)	0.56 (29)	0.51 (22)	0	0.37 (15)	1 (2)
other_2	0.58 (39)	0.89 (3)	0.53 (31)	0.42 (18)	0	0.86 (9)	1 (1)
other_3	0.58 (45)	0.94 (7)	0.54 (35)	0.5 (25)	0	0.5 (20)	0 (1)
politics	0.57 (545)	0.46 (71)	0.58 (453)	0.52 (244)	0	0.51 (238)	0.53 (23)
race	0.61 (620)	0.6 (76)	0.63 (465)	0.61 (346)	0.96 (1)	0.56 (266)	0.61 (24)
religion	0.49 (102)	0.15 (11)	0.55 (73)	0.43 (63)	0	0.47 (41)	0.5 (2)
rural	0.62 (88)	0.46 (12)	0.63 (60)	0.52 (48)	0	0.6 (34)	0.67 (5)
sexuality	0.65 (169)	0.66 (10)	0.66 (143)	0.66 (73)	0.96 (1)	0.65 (68)	0.6 (7)
stigma	0.61 (84)	0.33 (3)	0.64 (64)	0.59 (34)	0	0.62 (41)	1 (1)
urban	0.52 (244)	0.53 (39)	0.53 (180)	0.52 (139)	0	0.43 (97)	0.58 (8)
violence	0.69 (174)	0.65 (20)	0.72 (132)	0.63 (90)	0	0.68 (76)	0.56 (9)
youth	0.61 (167)	0.52 (52)	0.72 (96)	0.55 (97)	0	0.63 (62)	1 (2)
NA	0.4 (5)	0	0.33 (3)	0 (1)	0	1 (3)	0

Code

coded_isbns_gender.df %>%
  group_by(dimension,category) %>%
  summarize(n=n(),p=n/sum(dy.ls), pr_F=mean(og_pr_F,na.rm=TRUE),
            .groups="drop") -> category_gender.df
  
{ category_gender.df %>%
    filter(p>lower_q) %>%
  ggplot(aes(y=pr_F,x=dimension,fill=category,label=category)) +
  geom_col(position=position_dodge2(width=1)) +
  geom_text(position=position_dodge2(width=1), vjust=1) +
  theme(legend.position="none") + 
  facet_wrap(vars(dimension), scales="free_x") +
  labs(x="gender ratio of  each category, excluding rare")} %>%
  plotly::ggplotly()

Code

coded_isbns_wide_unpacked.df %>%
  group_by(subj) %>% 
  mutate(subj_total = length(unique(isbn)))  %>%
  group_by(subj,meth) %>%
  summarize(.groups="drop",
       n = length(unique(isbn)),
       percent_of_subject = n/unique(subj_total),
       pr_F= mean(distinct(data.frame(isbn,og_pr_F))[["og_pr_F"]], na.rm=TRUE   )
            ) -> coded_subj_freq.df

preprint formatted tables

Code

subject_totals.df

# A tibble: 30 × 2
   subj        subject_total
   <chr>               <int>
 1 class                 450
 2 crim                  419
 3 econ                  868
 4 education             558
 5 environment           317
 6 ethnicity             290
 7 family                606
 8 gender                563
 9 health                646
10 identity              413
# ℹ 20 more rows

Code

# proportions table
subject_totals.df %>% 
  filter(subj!="NA") %>%
  filter(!str_starts(subj,"other")) %>%
  left_join(by="subj", 
    meth_subj_freq.df %>%
    mutate(display=percent_of_subject) %>%
    select(subj,meth,display) %>%
    pivot_wider(names_from=meth,values_from=display, values_fill=0) %>%
    select(!"NA")) %>%
  left_join(by="subj", 
    scope_subj_freq.df %>%
    mutate(display=percent_of_subject) %>%
    select(subj,scope,display) %>%
    pivot_wider(names_from=scope,values_from=display, values_fill=0) %>%
    select(!"NA")
    ) %>%
  gt::gt() %>%
  gt::tab_header("Proportion of each subject by methodology and scope") %>%
  gt::cols_label(subj="") %>%
  gt::cols_hide(c("theoretical")) %>%
  gt::tab_spanner(
    label = "methodology",
    columns = c(
      "empirical","longitudinal","qual","quant","theory"
    )
  ) %>%
  gt::tab_spanner(
    label = "scope",
    columns = c(
      "us","world",
    )
  ) %>%
  gt::cols_label(subject_total="(N)") %>%
  gt::fmt_number(decimals=2) %>%
  gt::fmt_number(columns=subject_total,decimals=0) %>%
  gt::data_color(
     columns = !c("subj","subject_total"),
     direction = "row",
     method="numeric",
     domain=c(0,1),
     palette = "Oranges",
     na_color = "white"
   ) %>%
  gt::tab_footnote("Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses. ")

Proportion of each subject by methodology and scope
	(N)	methodology					scope
	(N)	empirical	longitudinal	qual	quant	theory	us	world
class	450	0.90	0.14	0.69	0.45	0.35	0.62	0.21
crim	419	0.86	0.12	0.63	0.46	0.35	0.61	0.20
econ	868	0.89	0.13	0.64	0.48	0.38	0.61	0.24
education	558	0.88	0.14	0.60	0.55	0.39	0.59	0.17
environment	317	0.90	0.12	0.70	0.41	0.39	0.56	0.29
ethnicity	290	0.89	0.14	0.61	0.59	0.38	0.71	0.17
family	606	0.90	0.17	0.58	0.50	0.35	0.59	0.17
gender	563	0.90	0.10	0.64	0.50	0.39	0.56	0.20
health	646	0.90	0.15	0.59	0.56	0.39	0.60	0.16
identity	413	0.91	0.06	0.78	0.36	0.40	0.53	0.23
immigration	271	0.90	0.13	0.68	0.45	0.35	0.71	0.33
inequality	1,081	0.90	0.12	0.65	0.50	0.37	0.61	0.20
lifecourse	202	0.93	0.30	0.43	0.61	0.38	0.66	0.13
media	238	0.91	0.06	0.72	0.39	0.43	0.52	0.24
movements	263	0.86	0.07	0.79	0.34	0.41	0.57	0.25
networks	257	0.88	0.13	0.60	0.50	0.57	0.56	0.23
orgs	563	0.88	0.08	0.79	0.39	0.38	0.60	0.22
politics	642	0.85	0.11	0.71	0.38	0.37	0.61	0.28
race	701	0.88	0.11	0.66	0.49	0.38	0.67	0.16
religion	118	0.86	0.09	0.62	0.53	0.35	0.65	0.26
rural	100	0.88	0.12	0.60	0.48	0.34	0.52	0.25
sexuality	185	0.91	0.05	0.77	0.39	0.37	0.55	0.17
stigma	90	0.93	0.03	0.71	0.38	0.46	0.48	0.13
urban	280	0.87	0.14	0.64	0.50	0.35	0.72	0.22
violence	202	0.86	0.10	0.65	0.45	0.38	0.62	0.21
youth	178	0.94	0.29	0.54	0.54	0.35	0.59	0.17
Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses.

Code

coded_isbns_gender.df %>%
  filter(dimension=="subj") %>%
  group_by(category) %>%
  summarize(subject_gender=mean(og_pr_F,na.rm=TRUE),
            .groups="drop") %>%
  rename(subj=category) -> subject_genders.df


# gender table
subject_totals.df %>% 
  filter(subj!="NA") %>%
  filter(!str_starts(subj,"other")) %>%
  left_join(by="subj", subject_genders.df) %>%
  left_join(by="subj", 
    meth_subj_freq.df %>%
    mutate(display=pr_F) %>%
    select(subj,meth,display) %>%
    pivot_wider(names_from=meth,values_from=display, values_fill=0) %>%
    select(!"NA")) %>%
  left_join(by="subj", 
    scope_subj_freq.df %>%
    mutate(display=pr_F) %>%
    select(subj,scope,display) %>%
    pivot_wider(names_from=scope,values_from=display, values_fill=0) %>%
    select(!"NA")
    ) %>%
  bind_rows(
    category_gender.df %>% 
     filter(dimension %in% c("meth","scope")) %>% 
      select(category,pr_F) %>%
      pivot_wider(names_from=category,values_from=pr_F, values_fill=0) %>%
      mutate(subj="")
  ) %>%
  gt::gt() %>%
  gt::tab_header("Female proportion for each subject by method and scope") %>%
  gt::cols_label(subj="") %>%
  gt::cols_hide(c("theoretical")) %>%
  gt::tab_spanner(
    label = "methodology",
    columns = c(
      "empirical","longitudinal","qual","quant","theory"
    )
  ) %>%
  gt::tab_spanner(
    label = "scope",
    columns = c(
      "us","world",
    )
  ) %>%
  gt::cols_label(subject_total="(N)") %>%
  gt::cols_label(subject_gender="(subject mean)") %>%
  gt::fmt_number(decimals=2) %>%
  gt::fmt_number(columns=subject_total,decimals=0) %>%
  gt::sub_missing() %>%
  gt::data_color(
     columns = !c("subj","subject_total"),
     direction = "row",
     method="numeric",
     domain=c(0,1),
     palette = "RdBu",
     na_color = "white"
   ) %>% 
  gt::tab_row_group(
    label = "",
    rows = subj !="",
  ) %>%
  gt::tab_row_group(
    label = "Category Means",
    rows = subj =="",
  ) %>%
  gt::row_group_order(groups = c("", "Category Means")) %>%
  gt::tab_footnote("Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses. ")

Female proportion for each subject by method and scope
	(N)	(subject mean)	methodology					scope
	(N)	(subject mean)	empirical	longitudinal	qual	quant	theory	us	world

class	450	0.63	0.64	0.54	0.67	0.57	0.59	0.61	0.56
crim	419	0.59	0.61	0.49	0.65	0.56	0.59	0.57	0.49
econ	868	0.58	0.59	0.51	0.61	0.57	0.53	0.58	0.51
education	558	0.63	0.64	0.55	0.67	0.62	0.58	0.61	0.59
environment	317	0.54	0.55	0.57	0.55	0.53	0.46	0.53	0.55
ethnicity	290	0.59	0.60	0.69	0.62	0.59	0.59	0.62	0.52
family	606	0.66	0.67	0.58	0.70	0.62	0.61	0.64	0.64
gender	563	0.72	0.73	0.70	0.77	0.70	0.72	0.73	0.70
health	646	0.63	0.63	0.61	0.68	0.60	0.59	0.62	0.59
identity	413	0.61	0.62	0.62	0.63	0.60	0.61	0.60	0.51
immigration	271	0.60	0.61	0.61	0.64	0.56	0.57	0.57	0.55
inequality	1,081	0.62	0.63	0.54	0.65	0.62	0.59	0.60	0.54
lifecourse	202	0.64	0.66	0.62	0.69	0.62	0.56	0.62	0.74
media	238	0.54	0.55	0.42	0.60	0.44	0.49	0.55	0.49
movements	263	0.55	0.57	0.56	0.58	0.54	0.47	0.56	0.45
networks	257	0.51	0.54	0.53	0.55	0.51	0.47	0.49	0.41
orgs	563	0.59	0.60	0.42	0.63	0.56	0.52	0.58	0.49
politics	642	0.54	0.57	0.46	0.58	0.52	0.51	0.51	0.49
race	701	0.60	0.61	0.60	0.63	0.61	0.56	0.59	0.54
religion	118	0.48	0.49	0.15	0.55	0.43	0.47	0.48	0.39
rural	100	0.61	0.62	0.46	0.63	0.52	0.60	0.60	0.57
sexuality	185	0.65	0.65	0.66	0.66	0.66	0.65	0.65	0.52
stigma	90	0.63	0.61	0.33	0.64	0.59	0.62	0.65	0.65
urban	280	0.52	0.52	0.53	0.53	0.52	0.43	0.49	0.45
violence	202	0.67	0.69	0.65	0.72	0.63	0.68	0.65	0.61
youth	178	0.63	0.61	0.52	0.72	0.55	0.63	0.56	0.72
Category Means
	—	—	0.58	0.53	0.61	0.56	0.54	0.56	0.51
Categories are overlapping -- percentages may exceed 100%. Number of dissertations in category in parentheses.

Code

library(ggalluvial)

coded_isbns_wide.df %>%
  count(`subj:econ`,`meth:qual`,`claim:strong`) %>%
   mutate(`subj:econ`=as.factor(`subj:econ`) 
          %>% fct_recode( econ="TRUE",other_subj="FALSE"),
          `meth:qual`=as.factor(`meth:qual`)  %>%
            fct_recode( qualitative="TRUE",other_meth="FALSE"),
         `claim`=as.factor(`claim:strong`)  %>%
           fct_recode( strong="TRUE",not_strong="FALSE")
          ) %>%
  as.data.frame() %>%
  ggplot(aes(
    y=n,
    axis1=`subj:econ`,
    axis2=`meth:qual`,
  )) +
  geom_alluvium(aes(fill = claim)) +
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("econ", "qualitative"))

Code

vcd::mosaic( ~ `subj:econ` + `meth:qual` +`claim:strong`, data = coded_isbns_wide.df, shade = FALSE)