This year Kaggle is launching the second annual Data Science Survey Challenge, where we will be awarding a prize pool of $30,000 to notebook authors who tell a rich story about a subset of the data science and machine learning community. You can download the data and other information here.
# Load package and data:
# (The workspace is deliberately NOT cleared with `rm(list = ls())`: that
# call silently destroys whatever the person sourcing this script had loaded.)
library(tidyverse)

df_raw <- read_csv("multiple_choice_responses.csv")

# Rename for all columns:
# The first data row is used as the source of the column names (the later
# sections refer to columns by their question text). Extract it as a plain
# character vector rather than relying on implicit coercion of a one-row
# tibble inside stringr.
all_columns <- as.character(unlist(df_raw[1, ], use.names = FALSE))

# Spaces become underscores, so the names read e.g.
# `What_is_your_gender?_-_Selected_Choice`.
new_names <- str_replace_all(all_columns, " ", "_")

# Drop the question-text row so that only respondent rows remain, then apply
# the new names.
df_raw <- df_raw[-1, ]
names(df_raw) <- new_names
#-------------------------------------
# Fact 1: Women in DS/ML Community
#-------------------------------------
# Select the gender and country columns by name (not by position, which
# breaks silently if the column layout changes) and give them short names:
df_gender_nation <- df_raw %>%
  select(
    gender = `What_is_your_gender?_-_Selected_Choice`,
    nation = `In_which_country_do_you_currently_reside?`
  )

# Top 19 nations + Vietnam by number of DS/ML respondents:
top_19_nations <- df_gender_nation %>%
  count(nation, sort = TRUE) %>%
  head(19) %>%
  pull(nation)

# union() avoids a duplicate entry if Viet Nam is already in the top 19.
top19_vietnam <- union(top_19_nations, "Viet Nam")

# Keep only the selected nations and the Male/Female answers, and shorten the
# long official names of two countries:
df_gender_20nations <- df_gender_nation %>%
  filter(nation %in% top19_vietnam, gender %in% c("Male", "Female")) %>%
  mutate(
    nation = case_when(
      str_detect(nation, "United States") ~ "United States",
      str_detect(nation, "Kingdom") ~ "United Kingdom",
      TRUE ~ nation
    )
  )

# Respondent count per gender and nation (long format, used for the bars):
df1 <- df_gender_20nations %>%
  count(gender, nation)

# Calculate female rate:
# One row per nation with `Female` and `Male` count columns. A missing
# gender/nation combination is filled with 0 so the rate is 0 or 1 instead of
# NA. `label` is the rate as a percentage with exactly one decimal place, and
# `nation` becomes a factor whose levels follow ascending female rate.
df_rate <- df1 %>%
  pivot_wider(
    names_from = gender,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(rate = Female / (Female + Male)) %>%
  arrange(rate) %>%
  mutate(
    label = sprintf("%.1f%%", 100 * rate),
    nation = factor(nation, levels = nation)
  )

# Re-arrange order by female rate:
df1_ordered <- df1 %>%
  mutate(nation = factor(nation, levels = levels(df_rate$nation)))
# Shared settings for the charts below:
library(extrafont)
my_font <- "Roboto Condensed" # Font family used for plot text.
my_caption <- "Source: 2019 Kaggle ML & DS Survey" # Caption reused by the charts.
my_colors <- c("#2E74C0", "#CB454A") # Two fill colors, referenced by index.
# Function for creating our theme:
#
# Builds the ggplot2 theme shared by the charts: theme_minimal() plus the
# font family taken from the global `my_font`, a 0.7 cm plot margin, a light
# background fill, and fixed sizes/colors for title, subtitle, caption and
# axis text.
#
# Args:
#   ...: Optional theme settings forwarded to ggplot2::theme(). They are
#        applied last, so they override the defaults set here. Previously
#        these arguments were accepted but ignored.
#
# Returns:
#   A ggplot2 theme object that can be added to a plot with `+`.
my_theme <- function(...) {
  theme_minimal() +
    theme(
      text = element_text(family = my_font),
      plot.margin = unit(rep(0.7, 4), "cm"),
      plot.background = element_rect(fill = "#EFF2F4", color = NA),
      plot.title = element_text(size = 19),
      plot.subtitle = element_text(size = 11.7, color = "grey30"),
      plot.caption = element_text(size = 10, color = "grey30"),
      axis.text.x = element_text(size = 10.5, color = "grey20"),
      axis.text.y = element_text(size = 10.5, color = "grey20")
    ) +
    theme(...)
}
# Graph presents fact 1:
# Horizontal 100%-stacked bars of respondents by gender for each nation, with
# the female share printed as text at y = 0.97 (near the right-hand end of
# each bar after coord_flip()).
df1_ordered %>%
  ggplot(aes(x = nation, y = n, fill = gender)) +
  geom_col(position = "fill") +
  coord_flip() +
  # Labels come straight from df_rate (one row per nation). Joining df_rate
  # onto the two-rows-per-nation bar data would draw every label twice on top
  # of itself.
  geom_text(
    data = df_rate,
    aes(x = nation, y = 0.97, label = label),
    inherit.aes = FALSE,
    size = 3.8,
    color = "white",
    family = my_font
  ) +
  # `breaks` pins the order that `labels` refers to.
  scale_fill_manual(
    name = "",
    values = c(Male = my_colors[1], Female = my_colors[2]),
    breaks = c("Female", "Male"),
    labels = c("Female", "Male")
  ) +
  # Explicit breaks so the five percentage labels always have five matching
  # tick positions instead of depending on ggplot2's default break choice.
  scale_y_continuous(
    breaks = seq(0, 1, 0.25),
    labels = paste0(seq(0, 100, 25), "%"),
    expand = c(0, 0)
  ) +
  my_theme() +
  theme(
    legend.position = "top",
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.key.height = unit(0.15, "mm"),
    legend.key.width = unit(5, "mm")
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(
    x = NULL,
    y = NULL,
    title = "Fact 1: Women in Machine Learning and Data Science Community",
    subtitle = "There’s still a significant gender gap for data scientists, with 84% of users identifying as males.\nThe United States has a slightly smaller gender gap at 79%, while Japan has a slightly higher one at 90%.",
    caption = my_caption
  )
#--------------------------------
# Fact 2: Age group by gender
#--------------------------------
# Count respondents per age group and gender, keeping Male/Female only.
df_age_gender <- df_raw %>%
  count(`What_is_your_age_(#_years)?`, `What_is_your_gender?_-_Selected_Choice`) %>%
  setNames(c("age_group", "gender", "n")) %>%
  filter(gender %in% c("Male", "Female"))

# Age groups in order of first appearance; used as the factor levels.
age_groups <- unique(df_age_gender$age_group)

# Male counts are negated so the two genders extend in opposite directions
# from zero in the chart.
df_age_gender <- df_age_gender %>%
  mutate(
    age_group = factor(age_group, levels = age_groups),
    n = as.numeric(n),
    n_new = n * if_else(gender == "Male", -1, 1)
  )

# Back-to-back horizontal bars; the axis labels show the negated Male side as
# positive counts.
ggplot(df_age_gender, aes(age_group, n_new, fill = gender)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    name = "",
    values = c(Male = my_colors[1], Female = my_colors[2]),
    labels = c("Female", "Male")
  ) +
  scale_y_continuous(
    breaks = seq(-4000, 1000, 500),
    labels = c(seq(4000, 500, -500), seq(0, 1000, 500))
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  my_theme() +
  theme(
    legend.position = "top",
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.key.height = unit(0.15, "mm"),
    legend.key.width = unit(5, "mm"),
    panel.grid.major.x = element_line(color = "grey50", linetype = "dotted")
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "Fact 2: Age Distribution by Gender",
    subtitle = "Millennials dominate data science, with 25-29 year olds being the most common age group.",
    caption = my_caption
  )
#----------------------------------------------------
# Fact 3: Popular Platforms for learning DS
#----------------------------------------------------
# Shorten the long survey answers to compact axis labels. Kept in one helper
# so the platform ranking and the per-gender counts are relabelled
# identically. Any answer that matches none of the patterns is returned
# unchanged.
shorten_platform <- function(platform) {
  case_when(
    str_detect(platform, "Univer") ~ "University",
    str_detect(platform, "Kaggle") ~ "Kaggle",
    str_detect(platform, "Link") ~ "LinkedIn",
    TRUE ~ platform
  )
}

# One row per (respondent, platform column) with a non-missing answer. Gender
# is selected by name rather than by column position. Values are coerced to
# character so columns of differing types can be stacked.
gender_platforms <- df_raw %>%
  select(
    gender = `What_is_your_gender?_-_Selected_Choice`,
    contains("On_which_platforms_have_you_begun_or_completed_data_science_courses")
  ) %>%
  pivot_longer(
    -gender,
    names_to = "question",
    values_to = "platform",
    values_transform = list(platform = as.character)
  ) %>%
  filter(!is.na(platform))

# The 12 most frequent answers across all respondents; "-1" entries are
# excluded from the ranking.
df_platform <- gender_platforms %>%
  filter(platform != "-1") %>%
  count(platform, sort = TRUE) %>%
  head(12)

# Original (unshortened) answer text of the top 12, used for filtering below.
top12_platform <- pull(df_platform, platform)

# Male/Female counts for the top-12 platforms, with shortened labels.
gender_platforms_count <- gender_platforms %>%
  filter(
    platform %in% top12_platform,
    gender %in% c("Male", "Female")
  ) %>%
  count(gender, platform) %>%
  mutate(platform = shorten_platform(platform))

# Apply the same shortening to the ranking so its order can serve as the
# factor levels of the plot.
df_platform <- df_platform %>%
  mutate(platform = shorten_platform(platform))

# Back-to-back horizontal bars: Male counts are negated so the genders extend
# in opposite directions from zero.
gender_platforms_count %>%
  mutate(
    platform = factor(platform, levels = df_platform$platform),
    n = as.numeric(n),
    n_new = if_else(gender == "Male", -n, n)
  ) %>%
  ggplot(aes(platform, n_new, fill = gender)) +
  geom_col() +
  coord_flip() +
  my_theme() +
  scale_fill_manual(
    name = "",
    values = c(Male = my_colors[1], Female = my_colors[2]),
    breaks = c("Female", "Male"),
    labels = c("Female", "Male")
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  # `labels = abs` shows the negated Male side as positive counts and cannot
  # drift out of sync with `breaks` the way a hand-written label vector can.
  scale_y_continuous(breaks = seq(-7500, 1500, 500), labels = abs) +
  theme(
    legend.position = "top",
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.key.height = unit(0.15, "mm"),
    legend.key.width = unit(5, "mm"),
    panel.grid.major.x = element_line(color = "grey50", linetype = "dotted")
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "Fact 3: Top Sources for Learning Data Science Skills/Courses",
    subtitle = "Coursera, Kaggle and Udemy are the most popular sources for learning Data Science.",
    caption = my_caption
  )