Network Analysis for RADx-UP Scientific Collaboration (or coauthorship)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Note that the echo = FALSE parameter was added to code chunks to prevent printing of the R code that generated the plot.

1. Output Author Affiliation Data - No Missing Author IDs

# Goal: to create author IDs for missing cases and output back to the same dataset 

# install (for the first time only) and load packages:
#install.packages("tidyverse")
#install.packages("readxl")

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(readxl)

# read in (or import) raw Author Affiliation Excel file:
author_affil <- read_excel("2021.09.23 Author_Affiliations.xlsx")

# create new variable (i.e., author_id2) (in a new dataframe, author-affil) where, if author_id is missing, enter the author's name for author_id; else print the existing author_id:
author_affil <- author_affil %>%
  mutate(author_id2 = case_when(is.na(author_id) ~ author_name, 
                                TRUE ~ author_id))
# check--are there any missing author_id or author_id2?:
author_affil %>%
  filter(is.na(author_id)) %>%
  select(author_id, author_id2)

## # A tibble: 0 × 2
## # … with 2 variables: author_id <chr>, author_id2 <chr>

author_affil %>%
  filter(is.na(author_id2)) %>%
  select(author_id, author_id2)

## # A tibble: 0 × 2
## # … with 2 variables: author_id <chr>, author_id2 <chr>

# output (or export) the author_affil dataframe you created to a *.csv file:
write_csv(author_affil, "2021.09.23 Author_Affiliations_no_missing_ids.csv")

3. Create Author Attribute Data

# Goal: to reduce the recoded Author Affiliations *.csv file (see #1.) to just author name, author ID, and one organizational type per author

# install (for the first time only) and load packages:
#install.packages("tidyverse")
#install.packages("readxl")

#library(tidyverse)
#library(readxl)

# read in (or import) recoded Author Affiliation *.csv file (if not already in the environment:
#author_affil <- read_csv("2021.09.23 Author_Affiliations_no_missing_ids.csv")

# Optional?: only keep pubs that are in the content analysis data set? read in the content analysis data for the publications:
#content <- read_excel("2021.09.23 Content_Analysis-CLEAN.xlsx")

# Optional? sort author affiliation dataframe by pmid variable (?):
#authors_aff <- authors_aff %>% 
#  filter(pmid %in% content$pmid)

# duplicate authors may have authored multiple papers, but we only need them once since we're only interested in the attributes of the author. create a new data set (i.e., authors_aff_unique) that is one row per author_id for cases with a single organizational type (org):
author_affil_unique <- author_affil %>%
  # select useful columns to keep in the dataframe:
    select(author_name, author_id, author_id2, author_type) %>%
  # if the following variables are distinct, then keep the values (?):
    distinct(author_id, author_id2, author_type, .keep_all = TRUE)

author_affil_unique <- author_affil_unique %>% 
# sort by the variable, author_id2:
  group_by(author_id2) %>% 
# create a variable (i.e., type_num) that numbers how many times author_id2 is in the dataframe with a unique org type:
  mutate(type_num = 1:n())
# if beyond 1, then the org type was different on different rows for the same author (e.g., author_id2 "7005932893" was listed as "academic" and "healthcare" on separate papers) 

# check--create a new dataframe (i.e., multi_type) that pulls the author_id2 of rows where the type_num value is more than 1 (that you'll need to assign a main or final type):
multi_type <- author_affil_unique %>%
  filter(type_num > 1) %>%
  select(author_id2) 

# sort the author_id2 in this dataframe (?):
 author_affil_unique %>%
  filter(author_id2 %in% multi_type$author_id2) %>%
  arrange(author_id2)

## # A tibble: 2 × 5
## # Groups:   author_id2 [1]
##   author_name author_id  author_id2 author_type type_num
##   <chr>       <chr>      <chr>      <chr>          <int>
## 1 Klonoff DC  7005932893 7005932893 Academic           1
## 2 Klonoff DC  7005932893 7005932893 Healthcare         2

# create a new dataframe (i.e., authors_aff_wide) that pivots data to wide format, with a new column (prefixed "type") for multiple org types per author
 #(long format had extra/duplicate rows for each type per author; wide format creates one row per author with multiple types as columns) :
author_affil_wide <- author_affil_unique %>%
  pivot_wider(names_from = type_num, values_from = c(author_type), names_prefix = "type")

# factor the type1 & type2 variables in this hierarchy (i.e., community = level 1, school = level 2, etc.): 
author_affil_wide <-author_affil_wide %>%
  mutate(type1_factor = factor(type1, levels=c("Community", 
                                               "School",
                                               "Government",
                                               "Professional", 
                                               "Healthcare", 
                                               "Academic")),
         type2_factor = factor(type2, levels=c("Community", 
                                               "School",
                                               "Government",
                                               "Professional", 
                                               "Healthcare", 
                                               "Academic")))
# thus we would prioritize "healthcare" over "academic" for author "7005932893"

# create a new variable (i.e., final_type) that assigns a main org type for authors with multiple affiliations, 
# based on the following pre-determined rules:
author_affil_wide <- author_affil_wide %>%
  mutate(final_type = case_when( 
   # if only type1 value exists, then assign the value of type1:
                          is.na(type2) ~ type1,
   # if type1 value is missing but type2 exists, then assign the value of type2: 
                          is.na(type1) & !is.na(type2) ~ type2,
   # if both type1 and type2 exist, then assign based on the hierarchy of the factored variables:
                          as.numeric(type1_factor) < as.numeric(type2_factor) 
                          & (!is.na(type1) & !is.na(type2)) ~ type1,
                          as.numeric(type2_factor) < as.numeric(type1_factor)  
                          & (!is.na(type1) & !is.na(type2)) ~ type2,
                          TRUE ~ type1)) %>%
   #select useful columns to keep in the dataframe (i.e., everything but the factor variables used to compute other variables):
    select(!c("type1_factor", "type2_factor"))

# check--summarize cross-tab counts of each org type (not sure what to look for in the cross-check):
table(author_affil_wide$final_type, 
      author_affil_wide$type1,
      useNA="always")

##               
##                Academic Community Government Healthcare Professional School
##   Academic          146         0          0          0            0      0
##   Community           0         1          0          0            0      0
##   Government          0         0          4          0            0      0
##   Healthcare          1         0          0         14            0      0
##   Professional        0         0          0          0           12      0
##   School              0         0          0          0            0      1
##   <NA>                0         0          0          0            0      0
##               
##                <NA>
##   Academic        0
##   Community       0
##   Government      0
##   Healthcare      0
##   Professional    0
##   School          0
##   <NA>            0

table(author_affil_wide$final_type, 
      author_affil_wide$type2,
      useNA="always")

##               
##                Healthcare <NA>
##   Academic              0  146
##   Community             0    1
##   Government            0    4
##   Healthcare            1   14
##   Professional          0   12
##   School                0    1
##   <NA>                  0    0

table(author_affil_wide$final_type, 
      author_affil_wide$type1, 
      author_affil_wide$type2,
      useNA="always")

## , ,  = Healthcare
## 
##               
##                Academic Community Government Healthcare Professional School
##   Academic            0         0          0          0            0      0
##   Community           0         0          0          0            0      0
##   Government          0         0          0          0            0      0
##   Healthcare          1         0          0          0            0      0
##   Professional        0         0          0          0            0      0
##   School              0         0          0          0            0      0
##   <NA>                0         0          0          0            0      0
##               
##                <NA>
##   Academic        0
##   Community       0
##   Government      0
##   Healthcare      0
##   Professional    0
##   School          0
##   <NA>            0
## 
## , ,  = NA
## 
##               
##                Academic Community Government Healthcare Professional School
##   Academic          146         0          0          0            0      0
##   Community           0         1          0          0            0      0
##   Government          0         0          4          0            0      0
##   Healthcare          0         0          0         14            0      0
##   Professional        0         0          0          0           12      0
##   School              0         0          0          0            0      1
##   <NA>                0         0          0          0            0      0
##               
##                <NA>
##   Academic        0
##   Community       0
##   Government      0
##   Healthcare      0
##   Professional    0
##   School          0
##   <NA>            0

# output (or export) the authors_aff_wide dataframe you created to a *.csv file:
write_csv(author_affil_wide, "2021.09.23 Author_Attribute_Data.csv")

2. Output Publication Data - With Dates

# Goal: to add variables concerning date of publication and output back to the same dataset

# install (for the first time only) and load packages:
#install.packages("tidyverse")
#install.packages("readxl")
#install.packages("lubridate")

#library(tidyverse)
#library(readxl)
library(lubridate)

## Loading required package: timechange

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# read in (or import) raw Publication *.csv file:
pubs <- read_excel("2021.09.23 Publications.xlsx")

# optional: missing values for the variables year_cite, month, or day will result in cases failing to parse. replace NA in the variable, day, with the value of 1 before creating new variables with day:
#pubs$day[is.na(pubs$day)] <- 1

# create new variables (i.e., year_month_date, year_month, date_publish) that strings existing variables (i.e., year_cite, month, day) in YMD format, separated by single spaces:
pubs <- pubs %>% 
  mutate(year_month_date = paste(year_cite, month, day, sep= ' '),
         year_month = paste(year_cite, month, sep= ' '),
  # if day is NA, use the new year_month_day variable into YMD format:
         date_publish = case_when(!is.na(day) ~ ymd(year_month_date),
  # if day and month are NA, use the new year_month variable into YM format:
                                  is.na(day) & !is.na(month) ~ ym(year_month)) )

## Warning: 13 failed to parse.

# create a new variable (i.e, weeks_from_start) that computes the age of RADx-UP at the time of each publication. this will be useful for summary statistics, plots, etc. at different cut points (or scales) of time.

# define 'start_date' of RADx-UP, roughly September 2020:
start_date <- "2020-08-27"

# calculate weeks from start_date:
pubs <- pubs %>%
  mutate(weeks_from_start = 
           as.numeric(difftime(date_publish, start_date, units = "weeks")))

# create a function (i.e., month_function) to define how long a "month" is (?):
month_function <- function(start_date, end_date) {
  result <- NA
  if (!is.na(end_date)){
    result <- length(seq.Date(as.Date(start_date), end_date, by = "month"))}
  result
}

# apply month_function to the dataframe (i.e., pubs) to create a new variable (i.e., months_from_start) for how many months old the pubs are:
pubs <- pubs %>%
  mutate(months_from_start = 
           as.numeric(lapply(date_publish, month_function, start_date=start_date)))

# select useful columns to keep in the dataframe 
# (i.e., everything but the YMD and YM format variables used to compute other variables):
pubs <- pubs  %>% 
  select(!c(year_month_date, year_month))

# output (or export) the pubs dataframe you created to a *.csv file:
write_csv(pubs, "2021.09.23 Publications_with_date_variables.csv")

4. Create Publication Attribute Data

# Goal: to clean raw content analysis data and merge with recoded Publication *.csv file (see #2.) 

# install (for the first time only) and load packages:
#install.packages("tidyverse")
#install.packages("readxl")
#install.packages("lubridate")

#library(tidyverse)
#library(readxl)
#library(lubridate)

# read in raw content analysis Excel file for the publications:
content <- read_excel("2021.09.23 Content_Analysis-CLEAN.xlsx")

## New names:
## • `Rural` -> `Rural...21`
## • `Rural` -> `Rural...29`

################################################################################

# create a new dataframe (i.e., content_recode) that renames long variables that were survey items:
content_recode <- content %>%
  rename(health_equity_addressed = `Health Equity Focus`, 
         type_of_analysis  = `Type of Analysis Identify for analytical studies whether the type of data analysis was qualitative, quantitative, both (mixed methods, literature review, or editorial.`) %>%  
  mutate(type_of_analysis = case_when(
    type_of_analysis == "N/A" ~ as.character(NA),
    TRUE ~ type_of_analysis)) 

# check--summarizes counts of each type of analysis (make sure it sums up to the number of papers in the analytical dataset):
table(content_recode$type_of_analysis)

## 
## Clinical case study           Editorial         Qualitative        Quantitative 
##                   1                   5                   3                  16

################################################################################

# create a new variable that recodes the dichotomous/check-all column options into a single categorical variable (i.e., UP_urban_rural).
# then create additional dichotomous variables (i.e., target_pop_urban, target_pop_rural), based on the created urban_rural variable so that "Both" counts as "target_pop_urban" and as "target_pop_rural":
content_recode <- content_recode %>% 
  rename(Rural = Rural...29) %>%
  mutate(UP_urban_rural = case_when( 
                                  (Rural == 1 & Urban==1) ~ "Both",
                                  (Rural == 0 & Urban==1) ~ "Urban",
                                  (Rural == 1 & Urban==0) ~ "Rural",
                                  (Rural == 0 & Urban==0) ~ as.character(NA)),
         target_pop_urban = case_when(
           UP_urban_rural %in% c("Both","Urban") ~ 1,
           TRUE ~ 0),
         target_pop_rural = case_when(
           UP_urban_rural%in% c("Both","Rural") ~ 1,
           TRUE ~ 0))

# check--summarizes cross-tab counts of new categorical variable to previous dichotomous variables:
table(content_recode$UP_urban_rural, 
      content_recode$Urban, content_recode$Rural)

## , ,  = 0
## 
##        
##         0 1
##   Both  0 0
##   Rural 0 0
##   Urban 0 3
## 
## , ,  = 1
## 
##        
##         0 1
##   Both  0 1
##   Rural 2 0
##   Urban 0 0

table(content_recode$Urban, 
      content_recode$target_pop_urban)

##    
##      0  1
##   0 21  0
##   1  0  4

table(content_recode$Rural,
      content_recode$target_pop_rural)

##    
##      0  1
##   0 22  0
##   1  0  3

################################################################################

# create a new variable that recodes the dichotomous/check-all column options into a single categorical variable (i.e., UP_age):
content_recode <- content_recode %>%
  mutate(UP_age = case_when(
         `Children and adolescents (< 18 yo)` == 1 ~ "UP_children",
         `Older Adults`  == 1 ~ "UP_elders",
          TRUE ~ "not_UP_age")) %>%
# rename the dichotomous/check-all columns:
  rename(target_pop_children = `Children and adolescents (< 18 yo)`, 
         target_pop_elders = `Older Adults` )

# check--summarizes cross-tab counts of new categorical variable to previous dichotomous variables:
table(content_recode$UP_age,
      content_recode$target_pop_children)

##              
##                0  1
##   not_UP_age  17  0
##   UP_children  0  1
##   UP_elders    7  0

table(content_recode$UP_age,
      content_recode$target_pop_elders)

##              
##                0  1
##   not_UP_age  17  0
##   UP_children  1  0
##   UP_elders    0  7

################################################################################

# create a new variable that recodes the dichotomous/check-all column options into a single categorical variable (i.e., UP_SES): 
content_recode <- content_recode %>%
  mutate(UP_SES = case_when(
    `Low socioeconomic status` == 1 ~ "UP_low_SES",
    `Low socioeconomic status` == 0 ~ "not_UP_SES")) %>%
# rename the dichotomous/check-all columns:
  rename(target_pop_SES = `Low socioeconomic status`)

# check--summarizes cross-tab counts of new categorical variable to previous dichotomous variables:
table(content_recode$UP_SES,
      content_recode$target_pop_SES)

##             
##               0  1
##   not_UP_SES 22  0
##   UP_low_SES  0  3

################################################################################

# create a new variable that recodes the dichotomous/check-all column options into a single categorical variable (i.e., UP_HCW):
content_recode <- content_recode %>%
  mutate(UP_HCW = case_when(
    `Health care worker` == 1 ~ "UP_healthcare_worker",
    `Health care worker` == 0 ~ "not_UP_healthcare_worker")) %>%
# rename the dichotomous/check-all columns:
  rename(target_pop_HCW = `Health care worker`)

# check--summarizes cross-tab counts of new categorical variable to previous dichotomous variables:
table(content_recode$UP_HCW,
      content_recode$target_pop_HCW)

##                           
##                             0  1
##   not_UP_healthcare_worker 20  0
##   UP_healthcare_worker      0  5

################################################################################

# create a new variable that recodes the dichotomous/check-all column options into a single categorical variable (i.e., UP_teacher):
content_recode <- content_recode %>%
  mutate(UP_teacher = case_when(
    Teacher == 1 ~ "UP_teacher",
    Teacher == 0 ~ "not_UP_teacher")) %>%
# rename the dichotomous/check-all columns:
  rename(target_pop_teacher = Teacher)

# check--summarizes cross-tab counts of new categorical variable to previous dichotomous variables:
table(content_recode$UP_teacher, 
      content_recode$target_pop_teacher)

##                 
##                   0  1
##   not_UP_teacher 24  0
##   UP_teacher      0  1

################################################################################

# create a new variable that recodes the dichotomous/check-all column options into a single categorical variable (i.e., UP_race):
content_recode <- content_recode %>%
  mutate(UP_race = case_when( 
    (`Asian American` == 1 |
     `Native Hawaiian` == 1 |
     `Pacific Islander` == 1) & 
     `Black or African American` == 0 & 
     `Hispanic or Latino`== 0 ~ "UP_Asian",
    
    (`American Indian` == 1 |
     `Alaska Native` == 1) & 
     `Black or African American` == 0 & 
     `Hispanic or Latino`== 0 ~ "UP_Indigenous",
    
     `Black or African American` == 1 & 
     `Hispanic or Latino` == 0 ~ "UP_Black",
    
     `Black or African American` == 0 & 
     `Hispanic or Latino` == 1 ~ "UP_Latinx",
    
     `Black or African American` == 1 & 
     `Hispanic or Latino` == 1 & 
     `American Indian` == 0 & 
     `Asian American` == 0 ~ "UP_Black Latinx",
    
     `Black or African American` == 1 & 
     `Hispanic or Latino` == 1 & 
     `American Indian` == 1 & 
     `Asian American` == 0 ~ "UP_Black Indigenous Latinx",
    
     `Black or African American` == 1 &
     `Hispanic or Latino` == 1 & 
     `Asian American` == 1 &
     `American Indian` == 0 ~ "UP_Black Latinx Asian",
    
      TRUE ~ "no_race_target"),
         
    target_race_binary = case_when(
      UP_race == "no_race_target" ~ 0,
      TRUE ~ 1)) %>%
# rename the dichotomous/check-all columns:
  rename(target_pop_Black = `Black or African American`,
         target_pop_Latinx = `Hispanic or Latino`,
         target_pop_Indigenous = `American Indian`,
         target_pop_Asian = `Asian American`)

# check--summarizes cross-tab counts of new categorical variable to previous dichotomous variables:
table(content_recode$UP_race,
      content_recode$target_pop_Black)

##                             
##                               0  1
##   no_race_target             14  0
##   UP_Black                    0  1
##   UP_Black Indigenous Latinx  0  1
##   UP_Black Latinx             0  5
##   UP_Indigenous               2  0
##   UP_Latinx                   2  0

table(content_recode$UP_race,
      content_recode$target_pop_Latinx)

##                             
##                               0  1
##   no_race_target             14  0
##   UP_Black                    1  0
##   UP_Black Indigenous Latinx  0  1
##   UP_Black Latinx             0  5
##   UP_Indigenous               2  0
##   UP_Latinx                   0  2

table(content_recode$UP_race, 
      content_recode$target_pop_Indigenous)

##                             
##                               0  1
##   no_race_target             14  0
##   UP_Black                    1  0
##   UP_Black Indigenous Latinx  0  1
##   UP_Black Latinx             5  0
##   UP_Indigenous               0  2
##   UP_Latinx                   2  0

table(content_recode$UP_race, 
      content_recode$target_pop_Asian)

##                             
##                               0
##   no_race_target             14
##   UP_Black                    1
##   UP_Black Indigenous Latinx  1
##   UP_Black Latinx             5
##   UP_Indigenous               2
##   UP_Latinx                   2

table(content_recode$UP_race, 
      content_recode$target_race_binary)

##                             
##                               0  1
##   no_race_target             14  0
##   UP_Black                    0  1
##   UP_Black Indigenous Latinx  0  1
##   UP_Black Latinx             0  5
##   UP_Indigenous               0  2
##   UP_Latinx                   0  2

################################################################################

# create a new dataframe (i.e., site_recode) that keeps and renames dichotomous/check-all variables:
site_recode <- content_recode %>%
  select(community_health_center = `Community health center`,
         hospital = `Hospital` ,
         in_home = `In-home` ,
         prison = `Prison/jail` ,
         care_facility = `Residential care facility`,
         school = `School` ,
         other = `Other Setting 1`) %>%
  mutate(hospital = case_when(
           other == "university-based hospital system" ~ 1,
           TRUE ~ hospital),
    
         community_events = case_when(
           other %in% c("Community organization",
                        "Free community-based Latino testing events",
                        "Street outreach") ~ 1 ,
           TRUE ~ 0),
         
         vaccination_sites = case_when(
           other %in% c("Vaccination sites: churches, clinics, and community events were the COVID-19 vaccine was being administered",
                        "Vaccination sites [outpatient primary care clinics, churches, community events, and outdoor vaccination drive through locations]") ~ 1 ,
           TRUE~0),
         
         other = case_when(
           is.na(other) | 
           hospital == 1 | 
           community_events == 1|
           vaccination_sites == 1 ~ 0,
           !is.na(other) ~ 1))

# create a function (i.e., which_site) to help with recoding... (?):
which_site <- function(x, y){names(which(x==y))[1]}

#here we locate which column (e.g., community health center, school, etc.)
#there is a value of 1 and grab the column's name for the new categorical variable, study_site 

site_recode <- site_recode  %>%
  mutate(study_site = apply(site_recode, 1, which_site, y = 1))

# note that a few publications had multiple sites
# "community_health_center" "other": coded as community_health_center
# "hospital"      "care_facility": coded as hospital

# add the categorical study_site variable to the content_recode dataframe:
content_recode <- content_recode %>% 
  mutate(study_site = site_recode$study_site)

#quick check:
table(content_recode$study_site, 
      content_recode$`School`)

##                   
##                    0 1
##   care_facility    8 0
##   community_events 1 0
##   hospital         2 0
##   school           0 1

table(content_recode$study_site, 
      content_recode$`Community health center`)

##                   
##                    0
##   care_facility    8
##   community_events 1
##   hospital         2
##   school           1

################################################################################

# create a new dataframe (i.e., design_recode) that keeps and renames dichotomous/check-all variables:
design_recode <- content_recode %>%
  select(experimental = `Experimental`,
         quasi_experimental = `Quasi-Experimental` ,
         simulation = `Simulation` ,
         observational = `Observational` ,
         evaluation = `Evaluation` ,
         formative = `Formative/exploratory`,
         dissemination = `Dissemination & implementation`,
         cost_benefit = `Cost-benefit` ,
         quant_other = `Other Quantitative Method`,
         focus_group = `Focus group` ,
         interview = `Interview`,
         survey = `Survey`,
         qual_other = `Other Qualitative`)

# create a function (i.e., which_design) to help with recoding... (?):
which_design <-function(x, y){paste(names(which(x==y)), collapse = ".")}

# here, we take each study design, note if the pub used it and paste it with other study design types, since studies can have multiple study designs (?)

design_recode <- design_recode  %>%
  mutate(study_design = apply(design_recode , 1, which_design, y = 1) ,
         study_design = case_when(
           study_design == "" ~ "no_study_design",
           TRUE ~ study_design))

# bind the whole design_recode dataframe to the content_recode dataframe:
content_recode <- bind_cols(content_recode, design_recode,
                            .name_repair = "minimal")

#quick check:
table(design_recode$study_design, 
      design_recode$observational)

##                                
##                                  0  1
##   experimental                   1  0
##   focus_group                    2  0
##   focus_group.interview          1  0
##   no_study_design                6  0
##   observational                  0 13
##   quasi_experimental.evaluation  1  0
##   simulation                     1  0

################################################################################

# create a new dataframe (i.e., engage_recode) that keeps and renames dichotomous/check-all variables and dummies comments in "other":
engage_recode <- content_recode %>%
  select(engage_broadcast_media = `Broadcast media (e.g., press conferences, TV interviews`,
         engage_entertainment = `Entertainment activities (e.g., picnics, raffles` ,
         engage_focus_groups = `Focus groups and/or surveys` ,
         engage_meetings_events = `In-person or online community presentations or meetings (professional or educational events` ,
         engage_social_media = `Internet/social media (e.g., email blast, blog, Facebook, YouTube`,
         engage_partnerships = `Partnerships with community-based organizations` ,
         engage_print_media = `Print media (e.g., press releases, newspaper articles`,
         engage_other1 = `Other Community Engagement 1`,
         engage_other2 = `Other Community Engagement 2`) %>%
  mutate(engage_advisory_board = case_when(
           engage_other1 %in% c("ad-hoc advisory group of community leaders in each SYCT county todevelop a community-informed strategy for distribution of at-home test kits",
                                "Community Advisory Boards") ~ 1, 
                                TRUE ~ 0),
         engage_peer_influence = case_when(
           engage_other1 %in% c("Identifying frontline champions",
                                "Peer influencers",
                                "Community member referral networks") ~ 1, 
                                 TRUE ~ 0),
         engage_transportation = case_when(
           engage_other1 %in% c("Street outreach; mobile vans",
                                "transportation to vaccination sites;  a network of public health/clinical health service providers to provide vaccinations; widespread vaccinations sites (ex. malls; removal of residential status as a barrier to obtain vaccines; public health worker delivery of vaccines in rural areas") ~ 1, 
                                 TRUE ~ 0),
         engage_partnerships = case_when(
           engage_other1 %in% c("COVID-19 data collection through the partnership for decision-making",
                                "Multi-disciplinary  collaborative  research and partnerships  focused  on  addressing community needs and building a translational science workforce to enhance project implementation and reach; faculty  members  engage  closely  with  local  health  departments,",
                                "Conducted an Asian-American and Native Hawaiian/Pacific Islander COVID-19 Needs Assessment") ~ 1,
                                TRUE ~ engage_partnerships),
         engage_meetings_events = case_when(
           engage_other1 %in% c("Free community-based Latino testing events") ~ 1,
                                TRUE ~ engage_meetings_events),
         engage_other1 = case_when(
           is.na(engage_other1) | 
           engage_advisory_board == 1 |
           engage_peer_influence == 1 |
           engage_partnerships == 1 |
           engage_meetings_events == 1 |
           engage_transportation == 1 ~ 0,
           !is.na(engage_other1) ~ 1),
         engage_other2 = case_when(is.na(engage_other2) ~ 0,
           !is.na(engage_other2) ~ 1))
 
engage_recode <- engage_recode   %>%
  mutate(engage_number = apply(engage_recode[, ], 1, sum),
         engage_binary = 1 * (engage_number > 0),
         cmty_engage = apply(engage_recode, 1, which_design, y = 1),
         cmty_engage = case_when(
           cmty_engage == " " ~ "no_engagement",
           TRUE ~ cmty_engage))

print(engage_recode, width=100000, n =15)

## # A tibble: 25 × 15
##    engage_broadcast_media engage_entertainment engage_focus_groups
##                     <dbl>                <dbl>               <dbl>
##  1                      0                    0                   0
##  2                      0                    0                   0
##  3                      0                    0                   0
##  4                      0                    0                   0
##  5                      0                    0                   1
##  6                      1                    0                   0
##  7                      0                    0                   0
##  8                      0                    0                   0
##  9                      0                    0                   1
## 10                      0                    0                   0
## 11                      0                    0                   0
## 12                      0                    0                   0
## 13                      0                    0                   0
## 14                      0                    0                   0
## 15                      0                    0                   0
##    engage_meetings_events engage_social_media engage_partnerships
##                     <dbl>               <dbl>               <dbl>
##  1                      0                   0                   0
##  2                      0                   0                   0
##  3                      0                   0                   0
##  4                      1                   0                   1
##  5                      1                   0                   0
##  6                      0                   1                   1
##  7                      0                   0                   0
##  8                      0                   0                   0
##  9                      0                   0                   0
## 10                      0                   0                   0
## 11                      0                   0                   0
## 12                      0                   0                   0
## 13                      0                   0                   0
## 14                      0                   0                   0
## 15                      0                   0                   0
##    engage_print_media engage_other1 engage_other2 engage_advisory_board
##                 <dbl>         <dbl>         <dbl>                 <dbl>
##  1                  0             0             0                     0
##  2                  0             0             0                     0
##  3                  0             0             0                     0
##  4                  0             0             0                     0
##  5                  0             0             0                     0
##  6                  0             0             0                     0
##  7                  0             0             0                     0
##  8                  0             0             0                     0
##  9                  0             0             0                     0
## 10                  0             0             0                     0
## 11                  0             0             0                     0
## 12                  0             0             0                     0
## 13                  0             0             0                     0
## 14                  0             0             0                     0
## 15                  0             0             0                     0
##    engage_peer_influence engage_transportation engage_number engage_binary
##                    <dbl>                 <dbl>         <dbl>         <dbl>
##  1                     0                     0             0             0
##  2                     0                     0             0             0
##  3                     0                     0             0             0
##  4                     0                     0             2             1
##  5                     0                     0             2             1
##  6                     0                     0             3             1
##  7                     0                     0             0             0
##  8                     0                     0             0             0
##  9                     0                     0             1             1
## 10                     0                     0             0             0
## 11                     0                     0             0             0
## 12                     0                     0             0             0
## 13                     0                     0             0             0
## 14                     0                     0             0             0
## 15                     0                     0             0             0
##    cmty_engage                                                     
##    <chr>                                                           
##  1 ""                                                              
##  2 ""                                                              
##  3 ""                                                              
##  4 "engage_meetings_events.engage_partnerships"                    
##  5 "engage_focus_groups.engage_meetings_events"                    
##  6 "engage_broadcast_media.engage_social_media.engage_partnerships"
##  7 ""                                                              
##  8 ""                                                              
##  9 "engage_focus_groups"                                           
## 10 ""                                                              
## 11 ""                                                              
## 12 ""                                                              
## 13 ""                                                              
## 14 ""                                                              
## 15 ""                                                              
## # … with 10 more rows

table(engage_recode$cmty_engage)

## 
##                                                                                                         
##                                                                                                      17 
##                                          engage_broadcast_media.engage_social_media.engage_partnerships 
##                                                                                                       1 
##                                                                                     engage_focus_groups 
##                                                                                                       2 
##                                                              engage_focus_groups.engage_meetings_events 
##                                                                                                       1 
## engage_focus_groups.engage_meetings_events.engage_social_media.engage_print_media.engage_peer_influence 
##                                                                                                       1 
##                                                                 engage_focus_groups.engage_partnerships 
##                                                                                                       1 
##                                                              engage_meetings_events.engage_partnerships 
##                                                                                                       1 
##                                                                                     engage_partnerships 
##                                                                                                       1

table(engage_recode$cmty_engage, 
      engage_recode$engage_binary)

##                                                                                                          
##                                                                                                            0
##                                                                                                           17
##   engage_broadcast_media.engage_social_media.engage_partnerships                                           0
##   engage_focus_groups                                                                                      0
##   engage_focus_groups.engage_meetings_events                                                               0
##   engage_focus_groups.engage_meetings_events.engage_social_media.engage_print_media.engage_peer_influence  0
##   engage_focus_groups.engage_partnerships                                                                  0
##   engage_meetings_events.engage_partnerships                                                               0
##   engage_partnerships                                                                                      0
##                                                                                                          
##                                                                                                            1
##                                                                                                            0
##   engage_broadcast_media.engage_social_media.engage_partnerships                                           1
##   engage_focus_groups                                                                                      2
##   engage_focus_groups.engage_meetings_events                                                               1
##   engage_focus_groups.engage_meetings_events.engage_social_media.engage_print_media.engage_peer_influence  1
##   engage_focus_groups.engage_partnerships                                                                  1
##   engage_meetings_events.engage_partnerships                                                               1
##   engage_partnerships                                                                                      1

# bind the whole design_recode dataframe to the content_recode dataframe:
content_recode <- bind_cols(content_recode, engage_recode,
                            .name_repair = "minimal")

################################################################################

# TSBM (translational science benefits model): vaccines

# need to create 4 basic variables of 1 = did translational science model
# and 0 = did not do it
# then create variable of combinations, number of TSMB as well as binary indicator

# here for vaccines

# here we recode the main variables for clinical, public health, economic
# and policy. recode so that a 1 is did that benefit and 0 otherwise 

content_recode <- content_recode %>%
  mutate(
    TSBM_vaccines_clinical = case_when(
      (`Vaccination Clinical Guidelines` == 1 |
       `Vaccination Procedures` == 1 |
       `Vaccine Technology`== 1) ~ 1,
        TRUE ~ 0),
    TSBM_vaccines_public_health = case_when(
      (`Community Vaccination Services` == 1 |
       `Vaccine Education Resources` == 1 |
       `Vaccination Accessibility`== 1|
       `Vaccine Delivery and Uptake`== 1|
       `Vaccine Hesitancy`== 1|
       `Software and Digital Health for Vaccination`==1|
       `Public Health Vaccination Practices`==1) ~ 1,
        TRUE ~ 0),
   TSBM_vaccines_economic = case_when(
     (`Vaccine License Agreements and Patents` == 1 |
      `Vaccine Non-Profit or Commercial Entities` == 1 |
      `Vaccine Cost Effectiveness`== 1|
      `Vaccine Cost Savings`== 1) ~ 1,
      TRUE ~ 0),
  TSBM_vaccines_policy = case_when(
    (`Vaccination Advisory Activities` == 1 |
     `Vaccination Policies and Legislation` == 1) ~ 1,
      TRUE ~ 0))

#now creating extra variables, number of TSBM benefits, pasted together
#benefits they did and binary indicator if did any benefit

TSBM_vaccine_columns <- c("TSBM_vaccines_clinical", 
                          "TSBM_vaccines_public_health",
                          "TSBM_vaccines_economic", 
                          "TSBM_vaccines_policy")

content_recode <- content_recode   %>%
  mutate(TSBM_vaccines_number = apply(content_recode[, TSBM_vaccine_columns], 1, sum),
         TSBM_vaccines_binary = 1 * (TSBM_vaccines_number > 0),
         TSBM_vaccines_recode = apply(content_recode[, TSBM_vaccine_columns], 1, which_design, y = 1) ,
         TSBM_vaccines_recode = case_when(TSBM_vaccines_recode == "" ~ "no_TSBM_vaccines",
                                             TRUE ~ TSBM_vaccines_recode))

#quick check:
print(content_recode[,c(TSBM_vaccine_columns, 
                        "TSBM_vaccines_binary", 
                        "TSBM_vaccines_recode", 
                        "TSBM_vaccines_number")],
      width=1000)

## # A tibble: 25 × 7
##    TSBM_vaccines_clinical TSBM_vaccines_public_health TSBM_vaccines_economic
##                     <dbl>                       <dbl>                  <dbl>
##  1                      0                           0                      0
##  2                      0                           0                      0
##  3                      0                           0                      0
##  4                      1                           0                      0
##  5                      0                           1                      0
##  6                      0                           0                      0
##  7                      0                           0                      0
##  8                      0                           1                      0
##  9                      0                           1                      0
## 10                      0                           0                      0
##    TSBM_vaccines_policy TSBM_vaccines_binary TSBM_vaccines_recode       
##                   <dbl>                <dbl> <chr>                      
##  1                    0                    0 no_TSBM_vaccines           
##  2                    0                    0 no_TSBM_vaccines           
##  3                    0                    0 no_TSBM_vaccines           
##  4                    0                    1 TSBM_vaccines_clinical     
##  5                    0                    1 TSBM_vaccines_public_health
##  6                    0                    0 no_TSBM_vaccines           
##  7                    0                    0 no_TSBM_vaccines           
##  8                    0                    1 TSBM_vaccines_public_health
##  9                    0                    1 TSBM_vaccines_public_health
## 10                    0                    0 no_TSBM_vaccines           
##    TSBM_vaccines_number
##                   <dbl>
##  1                    0
##  2                    0
##  3                    0
##  4                    1
##  5                    1
##  6                    0
##  7                    0
##  8                    1
##  9                    1
## 10                    0
## # … with 15 more rows

table(content_recode$TSBM_vaccines_binary, 
      content_recode$TSBM_vaccines_number)

##    
##      0  1  2
##   0 10  0  0
##   1  0 12  3

table(content_recode$TSBM_vaccines_binary, 
      content_recode$TSBM_vaccines_recode)

##    
##     no_TSBM_vaccines TSBM_vaccines_clinical
##   0               10                      0
##   1                0                      4
##    
##     TSBM_vaccines_clinical.TSBM_vaccines_public_health
##   0                                                  0
##   1                                                  1
##    
##     TSBM_vaccines_public_health
##   0                           0
##   1                           8
##    
##     TSBM_vaccines_public_health.TSBM_vaccines_economic
##   0                                                  0
##   1                                                  1
##    
##     TSBM_vaccines_public_health.TSBM_vaccines_policy
##   0                                                0
##   1                                                1

################################################################################

# TSBM (translational science benefits model): testing

content_recode <- content_recode %>%
  mutate(
    TSBM_testing_clinical = case_when(
      (`Testing Clinical Guidelines` == 1 |
       `Testing Procedures` == 1 |
       `Testing Technology`== 1) ~ 1,
        TRUE ~ 0),
    TSBM_testing_public_health = case_when(
      (`Community Testing Services` == 1 |
       `Testing Education Resources` == 1 |
       `Testing Accessibility`== 1|
       `Testing Hesitancy`== 1|
       `Software and Digital Health for Testing`== 1|
       `Public Health Testing Practices`== 1) ~ 1,
        TRUE ~ 0),
    TSBM_testing_economic = case_when(
      (`Testing License Agreements and Patents` == 1 |
       `Testing Non-Profit or Commercial Entities` == 1 |
       `Testing Cost Effectiveness`== 1|
       `Testing Cost Savings`== 1) ~ 1 ,
        TRUE ~ 0),
    TSBM_testing_policy = case_when(
      (`Testing Advisory Activities` == 1 |
       `Testing Policies and Legislation` == 1) ~ 1,
        TRUE ~ 0) )

# now creating extra variables, number of TSBM benefits, pasted together
# benefits they did and binary indicator if did any benefit

TSBM_testing_columns <- c("TSBM_testing_clinical", 
                          "TSBM_testing_public_health",
                          "TSBM_testing_economic",
                          "TSBM_testing_policy")

content_recode  <- content_recode   %>%
  mutate(TSBM_testing_number = apply(content_recode[, TSBM_testing_columns], 1, sum),
         TSBM_testing_binary = 1 * (TSBM_testing_number > 0),
         TSBM_testing_recode = apply(content_recode[, TSBM_testing_columns], 1, which_design, y = 1) ,
         TSBM_testing_recode = case_when(TSBM_testing_recode == "" ~ "no_TSBM_vaccines",
                                          TRUE ~ TSBM_testing_recode))

# quick check:
print(content_recode[, c(TSBM_testing_columns, 
                         "TSBM_testing_binary", 
                         "TSBM_testing_recode", 
                         "TSBM_testing_number")],
      width=1000)

## # A tibble: 25 × 7
##    TSBM_testing_clinical TSBM_testing_public_health TSBM_testing_economic
##                    <dbl>                      <dbl>                 <dbl>
##  1                     0                          0                     0
##  2                     0                          0                     0
##  3                     0                          0                     0
##  4                     0                          1                     0
##  5                     0                          0                     0
##  6                     0                          1                     1
##  7                     0                          1                     0
##  8                     0                          0                     0
##  9                     0                          0                     0
## 10                     0                          0                     0
##    TSBM_testing_policy TSBM_testing_binary
##                  <dbl>               <dbl>
##  1                   0                   0
##  2                   0                   0
##  3                   0                   0
##  4                   0                   1
##  5                   0                   0
##  6                   0                   1
##  7                   0                   1
##  8                   0                   0
##  9                   0                   0
## 10                   0                   0
##    TSBM_testing_recode                              TSBM_testing_number
##    <chr>                                                          <dbl>
##  1 no_TSBM_vaccines                                                   0
##  2 no_TSBM_vaccines                                                   0
##  3 no_TSBM_vaccines                                                   0
##  4 TSBM_testing_public_health                                         1
##  5 no_TSBM_vaccines                                                   0
##  6 TSBM_testing_public_health.TSBM_testing_economic                   2
##  7 TSBM_testing_public_health                                         1
##  8 no_TSBM_vaccines                                                   0
##  9 no_TSBM_vaccines                                                   0
## 10 no_TSBM_vaccines                                                   0
## # … with 15 more rows

table(content_recode$TSBM_testing_binary, 
      content_recode$TSBM_testing_number)

##    
##      0  1  2
##   0 17  0  0
##   1  0  6  2

table(content_recode$TSBM_testing_binary, 
      content_recode$TSBM_testing_recode)

##    
##     no_TSBM_vaccines TSBM_testing_clinical.TSBM_testing_economic
##   0               17                                           0
##   1                0                                           1
##    
##     TSBM_testing_public_health TSBM_testing_public_health.TSBM_testing_economic
##   0                          0                                                0
##   1                          6                                                1

# rename some health variables:
content_recode <- content_recode %>% 
  rename(Neurological = `Neurological disorders (includes Guillain-Barre syndrome)`,
         HIV = `HIV/AIDS`,
         HepC = `Hepatitis C`,
         Diabetes = `Diabetes, Hyperglycemia or Hypoglycemia`)

#reduce to key variables of interest:

vars <- c("pmid", 
          "UP_urban_rural", 
          "type_of_analysis" ,
          "health_equity_addressed", 
          "UP_age",
          "target_pop_children", 
          "target_pop_elders", 
          "UP_SES", 
          "target_pop_SES",
          "UP_HCW", 
          "target_pop_HCW", 
          "UP_teacher", 
          "target_pop_teacher",
          "target_pop_urban", 
          "target_pop_rural",
          "UP_race", 
          "target_race_binary",
          "target_pop_Black", 
          "target_pop_Latinx", 
          "target_pop_Indigenous", 
          "target_pop_Asian",
          "study_site", 
          colnames(design_recode),
          colnames(engage_recode), 
          TSBM_vaccine_columns, 
          "TSBM_vaccines_binary", 
          "TSBM_vaccines_recode", 
          "TSBM_vaccines_number",
          TSBM_testing_columns, 
          "TSBM_testing_binary", 
          "TSBM_testing_recode", 
          "TSBM_testing_number",
          "Neurological",  
          "HIV", 
          "HepC", 
          "Diabetes", 
          "Obesity", 
          "Preeclampsia")

 content_recode <- content_recode %>% 
   select(all_of(vars))
 
################################################################################

# read in (or import) recoded Publication *.csv file:
pubs <- read_csv("2021.09.23 Publications_with_date_variables.csv") %>%
# duplicate PMIDs may be associated more than one project, but we only need them once since we're only interested in the attributes of the paper. if the following variables are distinct (i.e., unique PubMed ID numbers), then keep the values (?):
  distinct(pmid, .keep_all = T)

## Rows: 25 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (29): method, pi_accept, acknow_nih, acknow_radx, acknow_grant, covid, ...
## dbl  (10): proj_num, cycle, year_cite, day, volume, issue, cite_count, pmid,...
## lgl   (2): fund5, fund6
## dttm  (2): date_id, date_update
## date  (1): date_publish
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

pubs <- pubs %>% 
  # select useful columns to keep in the dataframe:
  select(cycle, year_cite, month, day, 
         date_publish, weeks_from_start, months_from_start, pmid, title)

################################################################################

# merge content analysis data onto publication data and export

pub_attributes <- left_join(x = content_recode, y = pubs, by = "pmid")

#only keep publication if on pub-author edgelist?
temp_edges <- read_csv("2021.09.23 Author_Affiliations_no_missing_ids.csv")

## Rows: 209 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): author_name, authors_aff, author_type
## dbl (3): pmid, author_id, author_id2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# sort pub dataframe by pmid variable (?):
pub_attributes <- pub_attributes  %>%
  filter(pmid %in% temp_edges$pmid)

# output (or export) the publication_attributes dataframe you created to a *.csv file:
write_csv(pub_attributes, "2021.09.23 Publication_Attribute_Data.csv")

5. Create Author-Publication Edgelist

# Goal: to merge recoded Author Affiliations *.csv file (see #1.) with recoded Publication *.csv file (see #2.)

# install (for the first time only) and load packages:
#install.packages("tidyverse")
#install.packages("readxl")
#install.packages("lubridate")

#library(tidyverse)
#library(readxl)
#library(lubridate)

# read in (or import) recoded Author Affiliation *.csv file:
#author_affil <- read_csv("2021.09.23 Author_Affiliations_no_missing_ids.csv")

#select useful columns to keep in the dataframe (i.e., just PMIDs & author IDs):
author_affil <- author_affil %>%
  select(pmid, author_id, author_id2)

# read in (or import) recoded Publication *.csv file and merge key edge attributes on the publication onto the edgelist: 
pubs <- read_csv("2021.09.23 Publications_with_date_variables.csv") %>%
  distinct(pmid, .keep_all = TRUE)

## Rows: 25 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (29): method, pi_accept, acknow_nih, acknow_radx, acknow_grant, covid, ...
## dbl  (10): proj_num, cycle, year_cite, day, volume, issue, cite_count, pmid,...
## lgl   (2): fund5, fund6
## dttm  (2): date_id, date_update
## date  (1): date_publish
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# duplicate PMIDs may be associated more than one project, but we only need them once since we're only interested in the attributes of the paper. if the following variables are distinct (i.e., unique PubMed ID numbers), then keep the values.

#select useful columns to keep in the dataframe:
pubs <- pubs %>% 
  select(proj_num, cycle, year_cite, month, day, date_publish, weeks_from_start, months_from_start, pmid)

# create a new dataframe (i.e., author_pub_edgelist) that uses left_join to map edge attributes from pubs into the edgelist, based on PMIDs as the identifier:
author_pub_edgelist <- left_join(x = author_affil, y = pubs, by = "pmid")

# reorder the columns:
author_pub_edgelist <- author_pub_edgelist %>% 
  select(author_id2, pmid, cycle:months_from_start, author_id)

# only keep pubs that are in the content analysis data set? read in the content analysis data for the publications:
#content <- read_excel("2021.09.23 Content_Analysis-CLEAN.xlsx")

# sort edgelist dataframe by pmid variable (?):
#author_pub_edgelist <- author_pub_edgelist %>% 
#  filter(pmid %in% content$pmid)

# output (or export) the author_pub_edgelist dataframe you created to a *.csv file:
write_csv(author_pub_edgelist, "2021.09.23 Author_Pub_Edgelist.csv")

6. Optional: Create Project-Publication List

# Goal: to output a list showing which projects are associated with each publication (pmid)

# install (for the first time only) and load packages
#install.packages("tidyverse")
#install.packages("readxl")
#install.packages("lubridate")

#library(tidyverse)
#library(readxl)
#library(lubridate)

# read in (or import) recoded Publications *.csv file 
#pubs <- read_csv("2021.09.23 Publications_with_date_variables.csv")

# only keep pubs that are in the content data set? read in the content analysis data for the publications:
#content <- read_excel("2021.09.23 Content_Analysis-CLEAN.xlsx")

# sort pubs dataframe by pmid variable (?):
pubs <- pubs %>% 
  filter(pmid %in% content$pmid)

# create a function (i.e., which_proj) to grab the project number, for a given variable (e.g., pmid):
which_proj <- function(dat, var, id){
    dat$proj_num[dat[, var] == id]
  }

# define unique set of pmids:
pmids <- unique(pubs$pmid)

# create a data frame (?) (i.e., list_of_projects) that grabs all projects associated with a pmid: 
list_of_projects <- lapply(pmids, which_proj, dat = pubs, var = "pmid")
### not sure what to do with missing column: Warning: Unknown or uninitialised column: `proj_num`.

names(list_of_projects) <- pmids

# save out list of projects for each pmid:
save(list_of_projects, file = "2021.09.23 Project_Publication_List.Rdata")

7. Create Co-Authorship Network from the inputs–not the plot yet

# Goal: to construct basic igraph objects with:
# Author_Attribute_Data.csv (see #3.)
# Publication_Attribute_Data.csv (see #4.)
# Publication_Author_Edgelist.csv (see #5.)

# install (for the first time only) and load packages:
#install.packages("tidyverse")
#install.packages("igraph")

#library(tidyverse)
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:lubridate':
## 
##     %--%, union

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

# read in (or import) recoded Author Attribute *.csv file:
author_affil_wide <- read_csv("2021.09.23 Author_Attribute_Data.csv",
                  col_types = cols(author_id2 = col_character()))

# read in (or import) recoded Publication Attribute *.csv file and set pmid to character:
pub_attributes <- read_csv("2021.09.23 Publication_Attribute_Data.csv",
                  col_types = cols(pmid = col_character()))

#read in (or import) recoded Edgelist *.csv files and set pmid to character:
author_pub_edgelist <- read_csv("2021.09.23 Author_Pub_Edgelist.csv",
                       col_types = cols(pmid = col_character()))

################################################################################
                       
# create a combined data set of publication and author attributes 
# with shared identifier variables (i.e., id, node) in both dataframes

# publications: copy the PMID as the IDs:
pub_attributes <- pub_attributes  %>%
  mutate(id = pmid, node = "publication")

# authors: copy the author_id2 as the IDs:
author_affil_wide <- author_affil_wide  %>%
  mutate(id = author_id2, node = "author")

# define the attributes that are shared across the two dataframes (i.e., ids, type):
shared_identifiers <- c("id", "node")

# append or create a single, shared dataframe, stacking author attributes on top of publication attributes (i.e., id, node)
stacked.df <- bind_rows(author_affil_wide %>% select(all_of(shared_identifiers)), 
                    pub_attributes %>% select(all_of(shared_identifiers)) )

# create a temporary dataframe with author-specific attributes: put NAs for the publications:
temp_pub_data <- author_affil_wide[1, ]
temp_pub_data[1, ] <- NA
temp_pub_data_NA <- tibble(temp_pub_data, .rows = nrow(pub_attributes))

# append author and publication data for author-specific attributes,
# excluding the shared attributes:
author_specific <- bind_rows(author_affil_wide %>% select(-shared_identifiers), 
                             temp_pub_data_NA %>%select(-shared_identifiers))

## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(shared_identifiers)
## 
##   # Now:
##   data %>% select(all_of(shared_identifiers))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

# publication-specific attributes: put NAs for the authors:
temp_author_data <- pub_attributes[1, ]
temp_author_data[1, ] <- NA
temp_author_data_NA <- tibble(temp_author_data, .rows = nrow(author_affil_wide))

pub_specific <- bind_rows(temp_author_data_NA %>% select(-shared_identifiers),
                                  pub_attributes %>% select(-shared_identifiers))

# append stacked dataframe, author specific dataframe, and publication-specific dataframe:
attributes_author_pub  <- bind_cols(stacked.df, author_specific, pub_specific)

# add sender/receiver variables in the edgelist
# sender is author (author_id2), and receiver is publication (pmid):
author_pub_edgelist <- author_pub_edgelist %>% 
  mutate(sender = author_id2, receiver = pmid) %>%
  select(sender, receiver, cycle:author_id, author_id2, pmid)

############################
# Constructing the Network #
############################

# Construct the igraph object, using the edgelist and two attribute files 

# create a 'named' vector (i.e., type), showing if the node is an author (0) or publication (1) (i.e., type_binary),
# with the name of each element set to author_ids/PMIDs 
# will print an addition 0 if author or 1 if publication
attributes_author_pub <- attributes_author_pub %>%
  mutate(type_binary = 1*(node == "publication"))

type <- attributes_author_pub$type_binary
names(type) <- attributes_author_pub$id

# now creating network in igraph, with types set to type and 
# edges grabbed from sender/receiver columns, 
# transpose and make the 2 column matrix (i.e., the sender and receiver columns)into a vector (i.e., edge):
# but first, for some reason, the author_id2 being pulled in are numeric and not character. create a new variable that is character and pull that in for the edges so that it matches the author_id2 in the named vector, type.
author_pub_edgelist <- author_pub_edgelist %>%
  mutate(sender2 = as.character(author_pub_edgelist$sender))

edge <- c(t(author_pub_edgelist %>% 
              select(sender2, receiver)))

#edge_numeric <- as.numeric(factor(edge))
#levels <- levels(factor(edge))

# vector of senders (author_id2) and receivers (PMIDs)

coauthorship_net <- make_bipartite_graph(edges = edge, types = type)
#error: which vertex/node exists in edge, but not type?
# setdiff(edge, type)

# check to make this sure this looks right and grab useful indicator vectors:
type_from_igraph <- V(coauthorship_net)$type
table(type_from_igraph)

## type_from_igraph
## FALSE  TRUE 
##   179    25

#identify which are authors:
is_author_type <- type_from_igraph == FALSE

#identify which are publications:
is_publication_type <- type_from_igraph == TRUE

#put attributes onto object:
coauthorship_net <- set_vertex_attr(graph = coauthorship_net, 
                                   name = "final_type", 
                                   value = attributes_author_pub$final_type)

#getting name of publication as a vertex attribute
coauthorship_net <- set_vertex_attr(graph = coauthorship_net, 
                                    name = "title", 
                                    value = attributes_author_pub$title)

# saving out network and combined attribute file

save(coauthorship_net, attributes_author_pub, 
     file = "2021.09.23 Coauthorship Network igraph.Rdata")

8. Create initial plots - finally!

# Goal: to produce some very initial network images and summary statistics
# with a focus on organizational homophily, or connections # based on author-publication ties, 
# between different types of organizations.

# Questions to answer: 
# How rare/common are cross organizational ties?
# What pattern of cross-organizational ties do we tend to see? 
# Do we ever see ties involving community partners?
# Under what conditions do we see ties involving community partners?

# install (for the first time only) and load packages:
#install.packages("tidyverse")
#install.packages("igraph")
#install.packages("qgraph")

library(tidyverse)
library(igraph)
library(qgraph)

# load (or import) igraph file:
load("2021.09.23 Coauthorship Network igraph.Rdata")

# extract the affiliation matrix:
affil_matrix <- as_incidence_matrix(coauthorship_net, sparse = F)

type <- V(coauthorship_net)$type

#identify which nodes are authors:
is_author_type <- type == FALSE

#identify which nodes are publications:
is_publication_type <- type == TRUE

####################################
# Basic features of network        #
# (to understand and plot network) #
####################################

# conduct component analysis, looking at reachability
components <- components(coauthorship_net)

# identify which components (and thus nodes) are based on single paper, 
# i.e., not so interesting a structure to examine. 
num_pubs <- function(mems, types = type, x) {
  sum(types[which(mems == x)])
}

num_pubs_in_comp <- unlist(lapply(1:components$no, FUN = num_pubs, 
                                         mems = components$membership, types=type))

# a quick check:
# which(components$membership==3)

#############################
# Initial exploratory plots # 
#############################

# Three versions of how we can plot: 
# A) author-publication
# B) author-author
# C) publication-publication

#1. Node size and color in a basic plot of A)

# take off node labels:
V(coauthorship_net)$label <- NA

# set size of all nodes to 6:
#V(coauthorship_net)$size <- 6 

# or make the author nodes smaller than the publication nodes:
V(coauthorship_net)$size[is_author_type] <- 1.75

# make the publication nodes larger than the author nodes:
V(coauthorship_net)$size[is_publication_type] <- 3.75
  
# set the color, by node type: 
# authors (or FALSE) to UNC blue 
unc_blue <- rgb(red = 75, green = 156, blue = 211, 
                alpha = 1 * 255, maxColorValue = 255)
# pubs (or TRUE) to Duke blue
duke_blue <- rgb(red = 0, green = 83, blue = 155,
                 alpha = 1 * 255, maxColorValue = 255)
# set how each node will be colored:
base_color <- case_when(is_author_type ~  unc_blue,
                        is_publication_type ~ duke_blue)

V(coauthorship_net)$color <- base_color

# set node borders equal to fill:
V(coauthorship_net)$frame.color <- V(coauthorship_net)$color

# other possible options: 
# lighten up the edges a bit?
#E(coauthorship_net)$color <- rgb(.5, .5, .5, .25)

# make authors more transparent? 
#V(affil_net96_noisolates)$color[is_student_type] <- rgb(1, 0, 0, .1)

plot(coauthorship_net)

##################################################################

#2. Add a layout and a legend

set.seed(109)
layout1 <- layout_with_fr(coauthorship_net)

# create a blank, placeholder PDF:
pdf(paste("2021.09.23 author_publication_network.pdf")) 

#plot the network with the layout
plot(coauthorship_net, layout = layout1, margin = 0,
     main = "Author-Publication Network, Base Plot")

# for legend with same-size nodes:
#legend("bottomright", col = c(unc_blue, duke_blue), legend = c("Author", "Publication"),
#       pch = 21, pt.bg = c(unc_blue, duke_blue))

# for legend with actual-size nodes: 
size_vec <-sort(unique(V(coauthorship_net)$size))
scaled <- 1 + ((2-1) * (size_vec - min(size_vec) ) / (  max(size_vec) - min(size_vec) ) )
legend("bottomleft", col = c(unc_blue, duke_blue), legend = c("Author", "Publication"),
       pch = 21, pt.bg = c(unc_blue, duke_blue), pt.cex = scaled, cex = .9)

# close off graphic device:
dev.off()

## png 
##   2

##################################################################

#3. Add titles for deep dive figures:
V(coauthorship_net)$vertex.label <- V(coauthorship_net)$name
V(coauthorship_net)$vertex.label[V(coauthorship_net)$type==FALSE] <-NA

plot(coauthorship_net, layout = layout1, margin = 0,
     main = "Author-Publication Network, Base Plot", 
     vertex.label = V(coauthorship_net)$vertex.label,
     vertex.label.cex=.8)

pdf(paste("2021.09.23 author_publication_network_with_labels.pdf"))

#if wanted to try and tweak the layout 'by hand' using ported network package code:

#e <- get.edgelist(coauthorship_net, names=FALSE)
#l <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(coauthorship_net),
#                                       area = 1.5*(vcount(coauthorship_net)^2),
#                                       repulse.rad = (vcount(coauthorship_net)^3.0))

#plot(coauthorship_net, layout = l, margin = 0,
#     main = "Author-Publication Network, Base Plot")

#############################################################

# 4. set the node color, by type of affiliation:
# academic is UNC blue
# community is orange
# government is dark gray
# healthcare is eggplant
# professional is teal
# school spring green (?)
# pub is still duke blue

orange <- rgb(red = 244, green = 126, blue = 40, 
              alpha = 1 * 255, maxColorValue = 255)

dark_gray <- rgb(red = 71, green = 77, blue = 87, 
                 alpha = 1 * 255, maxColorValue = 255)

eggplant <- rgb(red = 81, green = 12, blue = 93, 
                alpha = 1 * 255, maxColorValue = 255)

teal <- rgb(red = 20, green = 169, blue = 162, 
            alpha = 1 * 255, maxColorValue = 255)

spring_green <- rgb(red = 157, green = 200, blue = 88, 
                    alpha = 1 * 255, maxColorValue = 255)

color_org_based <- case_when(V(coauthorship_net)$final_type == "Academic" ~ unc_blue,
                             V(coauthorship_net)$final_type == "Community" ~ orange, 
                             V(coauthorship_net)$final_type == "Government" ~ dark_gray,
                             V(coauthorship_net)$final_type == "Healthcare" ~ eggplant,
                             V(coauthorship_net)$final_type == "Professional" ~ teal,
                             V(coauthorship_net)$final_type == "School" ~ spring_green,
                             is.na(V(coauthorship_net)$final_type)  ~ duke_blue)

V(coauthorship_net)$color <- color_org_based
V(coauthorship_net)$frame.color <- V(coauthorship_net)$color

pdf(paste("2021.09.23 author_publication_network_colored_by_author_affiliation.pdf"))

plot(coauthorship_net, layout = layout1, margin = c(0, .0, 0, 0),
     main = "Author-Publication Network, by Type of Affiliation")        

col_list2 <- c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue)
scaled2 <- c(rep(scaled[1], length(col_list2) - 1), scaled[2])

legend("bottomleft", col = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue), 
       legend = c("Academic", "Community", "Government", "Healthcare", "Professional", "School", "Publication"),
       pch = 21, pt.bg = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue),
       pt.cex = scaled2, cex = .80)

dev.off()

## png 
##   2

#############################################################

# 5. Reduce plot to components, with 2 or more publications

#identify which nodes are in components that have less than 2 publications

multi_pubs <-components$membership %in% which(num_pubs_in_comp < 2) 

V(coauthorship_net)$multi_pubs  <- multi_pubs 
V(coauthorship_net)$comp <- components$membership

ids_multi_pubs <- which(multi_pubs)

# create a subgraph of the full network that include nodes in components w/ 2+ pubs: 
coauthorship_net_2pubs <- delete_vertices(coauthorship_net, ids_multi_pubs )

set.seed(113)
layout2 <- layout_with_fr(coauthorship_net_2pubs)

pdf(paste("2021.09.23 author_publication_network_2pubs_in_component.pdf"))

plot(coauthorship_net_2pubs, layout = layout2, margin = 0, 
     main = "Author-Publication Network, Components with >= 2 Publications")        

legend("bottomleft", col = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue), 
       legend = c("Academic", "Community", "Government", "Healthcare", "Professional", "School", "Publication"),
       pch = 21, pt.bg = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue),
       pt.cex = scaled2, cex = .80)

dev.off()

## png 
##   2

#############################################################
# B) the author-author network

# construct the one-mode projection:
onemode_nets <- bipartite_projection(coauthorship_net)
# and grab the author-author network:
author_net <- onemode_nets$proj1

# look at the weighted matrix:
author_mat <- as_adjacency_matrix(graph = author_net, attr = "weight", sparse = F)
table(author_mat)

## author_mat
##     0     1     2     3     4 
## 29543  2416    62    18     2

# 1. plot just authors,  with weights
plot(author_net , layout = layout_with_fr, margin = 0,
     edge.width = E(author_net)$weight)

# 2. plot binary network, no weights, and make nodes smaller with lighter borders
V(author_net)$size <- 1
E(author_net)$color <- rgb(.5, .5, .5, .10)

plot(author_net , layout = layout_with_fr, margin = 0,
     edge.width = 1)

# 3. plot the nth largest components
set_number_ofcomponents <- 10
largest_components <- which(components$csize %in% sort(components$csize, decreasing=T)[1:set_number_ofcomponents ])

author_net_largecomponents <- delete_vertices(author_net, 
                                              which( !(V(author_net)$comp %in% largest_components) ))

V(author_net_largecomponents)$size <- 1.5

E(author_net_largecomponents)$color <- rgb(.5, .5, .5, .10)

plot(author_net_largecomponents, layout = layout_with_fr, margin = 0,
     edge.width = 1)

# 4a. look at nodes in component with at least 2 pubs: 
author_net_2pubs <- delete_vertices(author_net, 
                                    which( V(author_net)$multi_pubs) )

set.seed(10)
layout3 <- layout_with_fr(author_net_2pubs)


pdf(paste("2021.09.23 author_author_network_2pubs_in_component.pdf"))

plot(author_net_2pubs, layout = layout3, margin = 0,
     edge.width = 1, 
     main = "Author-Author Network, Components with >= 2 Publications")   

legend("bottomleft", col = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green), 
       legend = c("Academic", "Community", "Government", "Healthcare", "Professional", "School"),
       pch = 21, pt.bg = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue),
       pt.cex = 1, cex = .80)

dev.off()

## png 
##   2

#############################################################

#alternative layout?
e <- get.edgelist(author_net_2pubs, names=FALSE)

#default:
#l <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(author_net_2pubs),
#                                       area = 1*(vcount(author_net_2pubs)^2),
#                                       repulse.rad = (vcount(author_net_2pubs)^3.0))

#increasing area a bit: 
l <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(author_net_2pubs),
                                       area = 2.7*(vcount(author_net_2pubs)^2),
                                       repulse.rad = (vcount(author_net_2pubs)^3.0))

pdf(paste("2021.09.23 author_author_network_2pubs_in_component_alt_layout.pdf"))

plot(author_net_2pubs, layout = l, margin = 0,
     edge.width = 1, 
     main = "Author-Author Network, Components with >= 2 Publications")   

legend("bottomright", col = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green), 
       legend = c("Academic", "Community", "Government", "Healthcare", "Professional", "School"),
       pch = 21, pt.bg = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue),
       pt.cex = 1, cex = .80)

dev.off()

## png 
##   2

#############################################################

# 4b. look at nodes in component with at least 2 pubs (one color for nodes): 
pdf(paste("2021.09.23 author_author_network_2pubs_in_component_one_color.pdf"))

  plot(author_net_2pubs, layout = l, margin = 0,
     edge.width = 1, 
     main = "Author-Author Network, Components with >= 2 Publications",
     vertex.color = unc_blue, vertex.frame.color = unc_blue)   

dev.off()

## png 
##   2

#############################################################

# 5. plot all nodes with alternative layout:
e <- get.edgelist(author_net, names=FALSE)

l2 <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(author_net),
                                       area = 2.7*(vcount(author_net)^2),
                                       repulse.rad = (vcount(author_net)^3.0))

pdf(paste("2021.09.23 author_author_network_alt_layout.pdf"))

plot(author_net, layout = l2, margin = 0,
     edge.width = 1, 
     main = "Author-Author Network, Colored by Organization")   

legend("bottomleft", col = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green), 
       legend = c("Academic", "Community", "Government", "Healthcare", "Professional", "School"),
       pch = 21, pt.bg = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue),
       pt.cex = 1, cex = .80)

dev.off()

## png 
##   2

#############################################################

#same thing but one color
pdf(paste("2021.09.23 author_author_network_alt_layout_one_color.pdf"))

plot(author_net, layout = l2, margin = 0,
     edge.width = 1, 
     main = "Author-Author Network, Colored by Organization",
     vertex.color = unc_blue, vertex.frame.color = unc_blue)   

dev.off()

## png 
##   2

#############################################################

# 6. Just the biggest component? 

biggest_comp <- which(components$csize == max(components$csize))

author_net_maincomponent <- delete_vertices(author_net, 
                                            which( V(author_net)$comp != biggest_comp) )

V(author_net_maincomponent)$size <- 1.5

E(author_net_maincomponent)$color <- rgb(.5, .5, .5, .10)


set.seed(113)
layout4 <- layout_with_fr(author_net_maincomponent)

pdf(paste("2021.09.23 author_author_network_largest_component.pdf"))

plot(author_net_maincomponent, layout = layout4, margin = 0,
     edge.width = 1, main = "Author-Author Network, Largest Component")   

legend("bottomleft", col = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green), 
       legend = c("Academic", "Community", "Government", "Healthcare", "Professional", "School"),
       pch = 21, pt.bg = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue),
       pt.cex = 1, cex = .80)

dev.off()

## png 
##   2

#############################################################

# 7. Include just repeat interactions, so coauthors at least twice as edge, remove all others

# keep those edges that have weights>1:
author_net_repeat <- delete_edges(author_net, which(E(author_net)$weight<2))

# delete any authors with no ties:
author_net_repeat <- delete_vertices(author_net_repeat, 
                                  which(degree(author_net_repeat)<1))

# plot:
#E(author_net_repeat)$color <- rgb(.5, .5, .5, .10)

e <- get.edgelist(author_net_repeat, names=FALSE)

#default:
#l <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(author_net_2pubs),
#                                       area = 1*(vcount(author_net_2pubs)^2),
#                                       repulse.rad = (vcount(author_net_2pubs)^3.0))

#increasing area a bit: 
l3 <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(author_net_repeat),
                                        area = 2.7*(vcount(author_net_repeat)^2),
                                        repulse.rad = (vcount(author_net_repeat)^3.0))

E(author_net_repeat)$color = "light gray"
V(author_net_repeat)$size = 2

pdf(paste("2021.09.23 author_author_network_repeat_collaborations.pdf"))

plot(author_net_repeat ,  margin = 0,
     edge.width = 1, layout = l3)        

legend("bottomleft", col = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green), 
       legend = c("Academic", "Community", "Government", "Healthcare", "Professional", "School"),
       pch = 21, pt.bg = c(unc_blue, orange, dark_gray, eggplant, teal, spring_green, duke_blue),
       pt.cex = 1, cex = .80)

dev.off()

## png 
##   2

#############################################################

#8. plot author-author network, by publications:
author_net

## IGRAPH 78b4f22 UNW- 179 1249 -- 
## + attr: name (v/c), final_type (v/c), title (v/c), label (v/l), size
## | (v/n), color (v/c), frame.color (v/c), vertex.label (v/c), multi_pubs
## | (v/l), comp (v/n), weight (e/n), color (e/c)
## + edges from 78b4f22 (vertex names):
##  [1] 57220026426--35452136000 57220026426--57220023616 57220026426--57220028740
##  [4] 57220026426--57220036794 57220026426--57220038353 57220026426--57201635827
##  [7] 35452136000--57220023616 35452136000--57220028740 35452136000--57220036794
## [10] 35452136000--57220038353 35452136000--57201635827 35452136000--36850389900
## [13] 35452136000--7006733030  35452136000--57226121039 35452136000--35737232200
## [16] 35452136000--57226101415 35452136000--7201375470  35452136000--57198326551
## + ... omitted several edges

edges_coauthor_net <- as_edgelist(coauthorship_net)

relabel_function <- function(edges_coauthor_net, ids){
  lab <- paste(edges_coauthor_net[edges_coauthor_net[,1] == ids,2], 
               collapse = "_")
}

paper_ids <- unlist(lapply(V(author_net)$name, relabel_function, 
       edges_coauthor_net=edges_coauthor_net))

paper_ids_factor <- factor(paper_ids, labels=1:length(unique(paper_ids )))
V(author_net)$paper_ids_factor  <- paper_ids_factor 

paper_ids_dat <- tibble(paper_ids=paper_ids,
  paper_ids_factor= paper_ids_factor )
  
pdf(paste("2021.09.23 author_network_with_pub_labels.pdf"))

plot(author_net, layout = l2, margin = 0,
     edge.width = 1, 
     main = "Author-Author Network, lables",
     vertex.color = unc_blue, vertex.frame.color = unc_blue,
     vertex.label = paper_ids_factor, vertex.label.cex= .75)   

dev.off()

## png 
##   2

#############################################################

#paper_ids_dat %>% filter(paper_ids_factor == 30) %>% select(paper_ids)
#pubs %>% filter(pmid=="34010526") %>% select("title")