Questioning causality on sex, gender and COVID-19, and identifying bias in large-scale data-driven analyses: the Bias Priority Recommendations and Bias Catalog for Pandemics

First, we build a single dataset that combines all gender-related attributes from the available data.

Loading first dataset

Source for the 1st dataset: https://ourworldindata.org/coronavirus-source-data

First Dataset Exploration

The first dataset contains information for 141 countries. The available variables are:

# List the column names of the first dataset.
# NOTE(review): `covid_data` is not created in the visible code — presumably
# read from the Our World in Data source linked above; confirm.
names(covid_data)

##  [1] "iso_code"                           "continent"                         
##  [3] "location"                           "date"                              
##  [5] "total_cases"                        "new_cases"                         
##  [7] "new_cases_smoothed"                 "total_deaths"                      
##  [9] "new_deaths"                         "new_deaths_smoothed"               
## [11] "total_cases_per_million"            "new_cases_per_million"             
## [13] "new_cases_smoothed_per_million"     "total_deaths_per_million"          
## [15] "new_deaths_per_million"             "new_deaths_smoothed_per_million"   
## [17] "icu_patients"                       "icu_patients_per_million"          
## [19] "hosp_patients"                      "hosp_patients_per_million"         
## [21] "weekly_icu_admissions"              "weekly_icu_admissions_per_million" 
## [23] "weekly_hosp_admissions"             "weekly_hosp_admissions_per_million"
## [25] "total_tests"                        "new_tests"                         
## [27] "total_tests_per_thousand"           "new_tests_per_thousand"            
## [29] "new_tests_smoothed"                 "new_tests_smoothed_per_thousand"   
## [31] "tests_per_case"                     "positive_rate"                     
## [33] "tests_units"                        "stringency_index"                  
## [35] "population"                         "population_density"                
## [37] "median_age"                         "aged_65_older"                     
## [39] "aged_70_older"                      "gdp_per_capita"                    
## [41] "extreme_poverty"                    "cardiovasc_death_rate"             
## [43] "diabetes_prevalence"                "female_smokers"                    
## [45] "male_smokers"                       "handwashing_facilities"            
## [47] "hospital_beds_per_thousand"         "life_expectancy"                   
## [49] "human_development_index"

Let’s keep only gender-related information and countries with complete data

# Remove rows with NA
# A row is kept only when BOTH smoking columns are present. Dropping a row
# when EITHER value is missing guarantees that the per-country means computed
# below are never NA. `is.na()` is vectorised, so no row-wise apply() is needed
# (and this also works when `covid_data` is a plain data.frame).
female_smokers_has_na <- is.na(covid_data[["female_smokers"]])
male_smokers_has_na <- is.na(covid_data[["male_smokers"]])
filtered_covid <-
  covid_data[!(female_smokers_has_na | male_smokers_has_na), ]

# Group by country
# One row per `location`, holding the mean of the female and male smoking
# columns over that country's remaining rows.
# NOTE: the misspelled name `data_by_coutnry` is kept on purpose because later
# chunks refer to it.
data_by_coutnry <- filtered_covid %>%
  group_by(location) %>%
  summarise(
    female_smokers = mean(female_smokers),
    male_smokers = mean(male_smokers)
  )
rmarkdown::paged_table(head(data_by_coutnry))

Load 2nd dataset

Source for the 2nd dataset : https://globalhealth5050.org/the-sex-gender-and-covid-19-project/dataset/

# Read the second dataset from a local Excel workbook.
# NOTE(review): the path is absolute and machine-specific; the chunk only runs
# where the workbook exists at exactly this location.
Dataset <- read_excel(path = "C:/Users/bakka/Downloads/Dataset.xlsx")

Second Dataset Exploration

The second dataset contains information for 183 countries. The available variables are:

# List the column names of the second dataset.
names(Dataset)

##  [1] "Country code"                                               
##  [2] "Country"                                                    
##  [3] "Case & death data by sex?"                                  
##  [4] "Cases date"                                                 
##  [5] "Cases where sex-disaggregated data is available"            
##  [6] "Cases (% male)"                                             
##  [7] "Cases (% female)"                                           
##  [8] "Deaths date"                                                
##  [9] "Deaths where sex-disaggregated data is available"           
## [10] "Deaths (% male)"                                            
## [11] "Deaths (% female)"                                          
## [12] "Deaths in confirmed cases date"                             
## [13] "Proportion of deaths in confirmed cases (male)"             
## [14] "Proportion of deaths in confirmed cases (female)"           
## [15] "Proportion of deaths in confirmed cases (Male:female ratio)"
## [16] "Source"

As with the first dataset, we keep only gender-related information and countries with complete data.

# Keep only useful columns
# Columns are selected by name instead of by position (2, 6, 7, 10, 11, 13,
# 14, 15), so a change in the workbook layout fails loudly rather than
# silently picking the wrong columns. The order is the same as before because
# later chunks address the merged columns by position.
useful_columns <- c(
  "Country",
  "Cases (% male)",
  "Cases (% female)",
  "Deaths (% male)",
  "Deaths (% female)",
  "Proportion of deaths in confirmed cases (male)",
  "Proportion of deaths in confirmed cases (female)",
  "Proportion of deaths in confirmed cases (Male:female ratio)"
)
Dataset_useful <- Dataset[, useful_columns]

# TRUE for every row that has at least one missing value in the kept columns.
raw_has_na <- !complete.cases(Dataset_useful)

# Remove incomplete rows
filtered_covid2 <- Dataset_useful[!raw_has_na, ]

# Rename the key of the first dataset so both tables share a "Country" column
# that can be used for merging.
colnames(data_by_coutnry)[colnames(data_by_coutnry) == "location"] <-
  "Country"

rmarkdown::paged_table(head(filtered_covid2))

Merge into one dataset

After merging, we have complete information for 61 countries.

# Inner-join the two per-country tables on "Country"; only countries present
# in both tables are kept.
# NOTE(review): matching is on the exact country string, so differently
# spelled names in the two sources would be dropped — verify.
data_all <- merge(
  x = data_by_coutnry,
  y = filtered_covid2,
  by = "Country"
)
rmarkdown::paged_table(head(data_all))

Format final dataset

# The positional conversion and the renaming below both assume exactly ten
# columns: Country, two smoking columns, and seven columns from the second
# dataset.
stopifnot(ncol(data_all) == 10L)

# Convert every non-key column to numeric, stripping "%" signs from text
# columns. The conversion is done column by column with lapply(): apply() on a
# data frame first coerces it to a character matrix, which formats the numeric
# columns (7 significant digits by default) before they are parsed back.
# Columns that are already numeric are left untouched.
data_all[2:10] <- lapply(data_all[2:10], function(y) {
  if (is.numeric(y)) {
    y
  } else {
    as.numeric(gsub("%", "", y))
  }
})

# Short column names; the ones containing "%" are non-syntactic and must be
# back-quoted when used in expressions.
names(data_all) <-
  c(
    "Country",
    "Smokers_F",
    "Smokers_M",
    "Cases%M",
    "Cases%F",
    "Deaths%M",
    "Deaths%F",
    "ProportionDeathsInConfirmedCases_M",
    "ProportionDeathsInConfirmedCases_F",
    "ProportionDeathsInConfirmedCases_Ratio"
  )

Compute the male-to-female ratios of deaths, cases, proportion of deaths in confirmed cases, and smoking from the available data

# Append four male-to-female ratio columns to a copy of `data_all`.
# Each ratio divides the male column by the matching female column, so a value
# above 1 means the male figure is the larger one.
data_with_ratio <- data_all
data_with_ratio$ratio_deaths <-
  data_all[["Deaths%M"]] / data_all[["Deaths%F"]]
data_with_ratio$ratio_cases <-
  data_all[["Cases%M"]] / data_all[["Cases%F"]]
data_with_ratio$ratio_Proportion <-
  data_all[["ProportionDeathsInConfirmedCases_M"]] /
  data_all[["ProportionDeathsInConfirmedCases_F"]]
data_with_ratio$ratio_smoking <-
  data_all[["Smokers_M"]] / data_all[["Smokers_F"]]

Define four groups of countries based on whether their male-to-female ratio of cases and ratio of deaths are above or below 1

# Split the countries into four groups by the male-to-female ratios.
# The comparisons are strict, so a country whose ratio is exactly 1 (or NA)
# ends up in none of the groups.
# Group1: more male cases, fewer male deaths.
Group1 <-
  data_with_ratio %>% filter(ratio_cases > 1 & ratio_deaths < 1)
# Group2: fewer male cases, more male deaths.
Group2 <-
  data_with_ratio %>% filter(ratio_cases < 1 & ratio_deaths > 1)
# Group3: fewer male cases, fewer male deaths.
Group3 <-
  data_with_ratio %>% filter(ratio_cases < 1 & ratio_deaths < 1)
# Group4: more male cases, more male deaths.
Group4 <-
  data_with_ratio %>% filter(ratio_cases > 1 & ratio_deaths > 1)

# Turn `Country` into a factor whose levels follow increasing `ratio_deaths`,
# so that the bar charts draw the countries in that order.
# The earlier re-levelling by `ratio_smoking` was removed: it was overwritten
# immediately by this one and had no effect on the result.
order_country_by_ratio_deaths <- function(group) {
  group$Country <- factor(
    group$Country,
    levels = group$Country[order(group$ratio_deaths)]
  )
  group
}

Group1 <- order_country_by_ratio_deaths(Group1)
Group2 <- order_country_by_ratio_deaths(Group2)
# Group3 is plotted like the other groups, so it gets the same ordering.
Group3 <- order_country_by_ratio_deaths(Group3)
Group4 <- order_country_by_ratio_deaths(Group4)

Generate plots for each group of countries

# Group 2
# Two horizontal bar charts shown side by side, one bar per country:
# g1 = log of the deaths ratio, g2 = log of the smoking ratio.
# `guides(fill = "none")` hides the fill legend; it replaces the deprecated
# `guides(fill = FALSE)`. `geom_col()` is the direct equivalent of
# `geom_bar(stat = "identity")`.
# Values outside the ylim() range are removed by ggplot2 (with a warning).
g1 <-
  ggplot(Group2, aes(
    x = Country,
    y = log(ratio_deaths),
    fill = Country
  )) +
  coord_flip() +
  geom_col(width = 0.90) +
  xlab("") + # Blank axis labels
  ylab("") +
  ylim(0, 4.5) +
  guides(fill = "none") +
  theme(text = element_text(size = 7))

g2 <-
  ggplot(Group2, aes(
    x = Country,
    y = log(ratio_smoking),
    fill = Country
  )) +
  coord_flip() +
  geom_col(width = 0.90) +
  xlab("") + # Blank axis labels
  ylab("") +
  ylim(-0.05, 4.5) + # lower limit slightly below 0, unlike g1
  guides(fill = "none") +
  theme(text = element_text(size = 7))

ggpubr::ggarrange(g1, g2)

# Group 3
# Two horizontal bar charts shown side by side, one bar per country:
# g1 = log of the deaths ratio, g2 = log of the smoking ratio.
# Both share the same value range; the lower limit is negative because the
# deaths ratio in this group is below 1, so its log is negative.
# `guides(fill = "none")` hides the fill legend; it replaces the deprecated
# `guides(fill = FALSE)`. `geom_col()` is the direct equivalent of
# `geom_bar(stat = "identity")`.
g1 <-
  ggplot(Group3, aes(
    x = Country,
    y = log(ratio_deaths),
    fill = Country
  )) +
  coord_flip() +
  geom_col(width = 0.90) +
  xlab("") + # Blank axis labels
  ylab("") +
  ylim(-0.4, 4.5) +
  guides(fill = "none") +
  theme(text = element_text(size = 7))

g2 <-
  ggplot(Group3, aes(
    x = Country,
    y = log(ratio_smoking),
    fill = Country
  )) +
  coord_flip() +
  geom_col(width = 0.90) +
  xlab("") + # Blank axis labels
  ylab("") +
  ylim(-0.4, 4.5) +
  guides(fill = "none") +
  theme(text = element_text(size = 7))


ggpubr::ggarrange(g1, g2)

# Group 4
# Two horizontal bar charts shown side by side, one bar per country:
# g1 = log of the deaths ratio, g2 = log of the smoking ratio.
# `guides(fill = "none")` hides the fill legend; it replaces the deprecated
# `guides(fill = FALSE)`. `geom_col()` is the direct equivalent of
# `geom_bar(stat = "identity")`.
# Values outside the ylim() range are removed by ggplot2 (with a warning).
g1 <-
  ggplot(Group4, aes(
    x = Country,
    y = log(ratio_deaths),
    fill = Country
  )) +
  coord_flip() +
  geom_col(width = 0.90) +
  xlab("") + # Blank axis labels
  ylab("") +
  ylim(0, 4.5) +
  guides(fill = "none") +
  # NOTE(review): text size 9 here vs. 7 in every other panel — confirm intent.
  theme(text = element_text(size = 9))

g2 <-
  ggplot(Group4, aes(
    x = Country,
    y = log(ratio_smoking),
    fill = Country
  )) +
  coord_flip() +
  geom_col(width = 0.90) +
  xlab("") + # Blank axis labels
  ylab("") +
  # NOTE(review): a lower limit of 0 removes any country whose smoking ratio
  # is below 1 (negative log) — confirm this is intended.
  ylim(0, 4.5) +
  guides(fill = "none") +
  theme(text = element_text(size = 7))

ggpubr::ggarrange(g1, g2)