1 Objective

2 Preparation

2.1 Environment

  • Let us set up the working environment and be ready for the analysis
# theme
# Shared plot theme added to most figures: black-and-white base, bold 15pt
# title, 10pt subtitle and axis titles, horizontal x-axis labels, and no legend
# unless a plot switches it back on.
# NOTE: this object has the same name as ggplot2's theme() function. Calls such
# as theme(legend.position = "right") still reach the function, because R skips
# non-function objects when resolving a call.
theme <- theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 10),
    axis.text.x = element_text(angle = 0),
    legend.position = "none"
  )

# color
# RD = #D43F3AFF
# OG = #EEA236FF
# GN = #5CB85CFF
# LB = #46B8DAFF
# DB = #357EBDFF
# PR = #9632B8FF
# GY = #B8B8B8FF

2.2 Dataset

  • The dataset has 129 columns and 98,855 rows
# Read the survey responses into `data`; the path is resolved relative to the
# current working directory.
data <- read_csv(file = "DATA.csv")

3 EDA for Business Insight

3.1 Country

3.1.1 Treemap from Top 20 Countries with Most Respondents

  • Top 5 countries are US, India, Germany, UK, and Canada in sequence
# data
# Respondent count per country plus each country's share of all respondents,
# restricted to the 20 countries with the most respondents.
top.countries <- data %>%
  count(Country, name = "Number") %>%
  mutate(Perc = Number / sum(Number)) %>%
  top_n(20, wt = Number)

# plot
# Treemap with one tile per country; tile area is the respondent count.
ggplot(top.countries,
       aes(area = Number, fill = factor(Country), label = Country)) +
  geom_treemap(show.legend = FALSE) +
  geom_treemap_text(place = "centre",
                    color = "black",
                    fontface = "italic") +
  scale_fill_d3(palette = "category20", alpha = 0.5) +
  labs(x = NULL,
       y = NULL,
       title = "Treemap from Top 20 Countries with Most Respondents")

3.1.2 Amount and Percent of Respondents by Country

  • US (21%), India(14%), Germany(7%), UK (6%), Canada (3%)
  • The top 5 countries together account for 51% of respondents, i.e. just over half
# plot
# Horizontal bar chart of respondent counts for the countries in
# `top.countries`; each bar is annotated with that country's share of all
# respondents.
ggplot(data = top.countries,
       aes(x = reorder(Country, Number),
           y = Number)) +
  geom_bar(stat = "identity", fill = "#5CB85CFF") +
  # sprintf() formats every label on its own with one decimal place. The former
  # format(Perc*100, digit = 2) relied on partial matching of `digits` and
  # padded all labels to a common width, which left leading blanks in labels.
  geom_text(aes(label = sprintf("%.1f%%", Perc * 100)),
            hjust = -0.05,
            vjust = 0.3,
            size = 2.5,
            color = "black") +
  coord_flip() +
  theme +
  labs(title = "Amount and Percent of Respondents",
       x = NULL,
       y = "Count")

3.1.3 Distribution of Salary by Country

  • The US, Switzerland, and Israel offer the highest median salaries
  • India and China tend to have a larger variance on an offer
# data
# Full-time employed respondents, restricted to countries that have more than
# 500 such respondents.
dist.sal <- data %>%
  filter(Employment == "Employed full-time", !is.na(Country)) %>%
  add_count(Country, name = "Count") %>%
  filter(Count > 500)

# Those same countries ordered by descending median salary; used below to fix
# the order of the country axis.
con.sal <- dist.sal %>%
  group_by(Country) %>%
  summarise(m.sal = median(ConvertedSalary, na.rm = TRUE)) %>%
  arrange(desc(m.sal)) %>%
  transmute(Country = factor(Country))

# plot
# Violin per country of converted salary on a log10 axis.
options(scipen = 999)  # print plain numbers instead of scientific notation
ggplot(dist.sal, aes(x = Country, y = ConvertedSalary)) +
  geom_violin(fill = "#46B8DAFF") +
  scale_x_discrete(limits = con.sal$Country) +
  scale_y_log10() +
  coord_flip() +
  theme +
  labs(x = NULL,
       y = "Log Converted Salary",
       title = "Distribution of Salary")

3.1.4 Employment Rate from Top Surveyed Countries

  • Switzerland and South Africa are amongst the highest in employment
# data
# Employment statuses that count as "employed" when computing the rate.
temp <- c("Employed part-time",
          "Employed full-time",
          "Independent contractor, freelancer, or self-employed")

# Share of employed respondents per country, for countries with more than 500
# respondents.
emp.con <- data %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  # %in% never returns NA, so a respondent with a missing Employment value is
  # counted as not employed; the former na.rm argument had no effect here.
  summarise(avg.emp = mean(Employment %in% temp),
            Count = n()) %>%
  filter(Count > 500) %>%
  ungroup()

# plot
# Lollipop chart of the employment rate, countries ordered by rate.
ggplot(data = emp.con,
       aes(x = reorder(Country, avg.emp),
           y = avg.emp)) +
  geom_point(size = 5,
             color = "#357EBDFF") +
  geom_segment(aes(x = Country,
                   xend = Country,
                   y = 0,
                   yend = avg.emp),
               color = "#357EBDFF") +
  # sprintf() formats each label independently; format(x, digit = 3) relied on
  # partial matching of `digits` and padded labels to a common width.
  geom_label(aes(label = sprintf("%.1f%%", avg.emp * 100)),
             hjust = "inward",
             size = 3,
             color = "#357EBDFF") +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  theme +
  labs(title = "Employment Rate from Top Surveyed Countries",
       x = NULL,
       y = "Employment Rate")

3.1.5 Female to Male Ratio by Country

  • Only countries with more than 100 male and female respondents combined are analyzed
  • The best female-to-male ratio is 16.6%, i.e. roughly 17 female developers for every 100 male developers in that country
  • The worst ratio is 1.9%, i.e. roughly 2 female developers for every 100 male developers
# data
# Ten countries with the highest female-to-male respondent ratio, among
# countries with more than 100 male plus female respondents.
fm.r <- data %>%
  filter(!is.na(Country)) %>%
  filter(Gender %in% c("Male", "Female")) %>%
  count(Country, Gender, name = "Count") %>%
  # pivot_wider() replaces the superseded spread(). A country lacking one of
  # the two genders gets NA in that column and is removed by the N > 100 filter.
  pivot_wider(names_from = Gender, values_from = Count) %>%
  mutate(F2M = Female / Male,
         N = Female + Male) %>%
  filter(N > 100) %>%
  arrange(desc(F2M)) %>%
  head(10)

# plot
# Horizontal bars of the ratio, highest ratio at the top.
p.1 <- ggplot(data = fm.r,
              aes(x = reorder(Country, F2M),
                  y = F2M)) +
  geom_bar(stat = "identity",
           width = 0.5,
           fill = "#9632B8FF",
           alpha = 0.8) +
  # sprintf() formats each label independently; format(x, digit = 2) relied on
  # partial matching of `digits` and padded labels to a common width.
  geom_label(aes(label = sprintf("%.1f%%", F2M * 100)),
             size = 4,
             color = "#9632B8FF",
             hjust = "inward") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Better F2M",
       x = NULL,
       y = "Female to Male Ratio")

# data
# Ten countries with the lowest female-to-male respondent ratio, among
# countries with more than 100 male plus female respondents.
# Note: this overwrites the `fm.r` table built for the "Better F2M" plot.
fm.r <- data %>%
  filter(!is.na(Country)) %>%
  filter(Gender %in% c("Male", "Female")) %>%
  count(Country, Gender, name = "Count") %>%
  # pivot_wider() replaces the superseded spread(). A country lacking one of
  # the two genders gets NA in that column and is removed by the N > 100 filter.
  pivot_wider(names_from = Gender, values_from = Count) %>%
  mutate(F2M = Female / Male,
         N = Female + Male) %>%
  filter(N > 100) %>%
  arrange(F2M) %>%
  head(10)

# plot
# Horizontal bars of the ratio, lowest ratio at the top.
p.2 <- ggplot(data = fm.r,
              aes(x = reorder(Country, -F2M),
                  y = F2M)) +
  geom_bar(stat = "identity",
           width = 0.5,
           fill = "#9632B8FF",
           alpha = 0.8) +
  # sprintf() formats each label independently; format(x, digit = 2) relied on
  # partial matching of `digits` and padded labels to a common width.
  geom_label(aes(label = sprintf("%.1f%%", F2M * 100)),
             size = 4,
             color = "#9632B8FF",
             hjust = "inward") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Worse F2M",
       x = NULL,
       y = "Female to Male Ratio")

# plot
# Show the "better" and "worse" panels side by side.
ggarrange(p.1, p.2, nrow = 1)

3.2 Major

3.2.1 Undergrad Majors with NAs

  • NAs account for about 20% of the undergrad major responses
# data
# Share of all respondents per undergrad major; a missing major (NA) is kept
# as its own category.
major.withna <- data %>%
  group_by(UndergradMajor) %>%
  summarise(Count = n() / nrow(data)) %>%
  ungroup()

# plot
# Lollipop chart of the shares, one row per major.
ggplot(data = major.withna,
       aes(x = reorder(UndergradMajor, -Count),
           y = Count)) +
  geom_point(color = "#357EBDFF", show.legend = FALSE, size = 4) +
  geom_segment(aes(x = UndergradMajor,
                   xend = UndergradMajor,
                   y = Count,
                   yend = 0),
               size = 1,
               color = "#357EBDFF") +
  # sprintf() formats each label independently; format(x, digit = 1) relied on
  # partial matching of `digits` and padded labels to a common width.
  geom_label(aes(label = sprintf("%.1f%%", Count * 100)),
             hjust = "inward",
             size = 3,
             color = "#357EBDFF") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Undergrad Majors with NAs",
       x = NULL,
       y = NULL)

3.2.2 Undergrad Majors without NAs

  • Top 3 undergrad majors are computer science, other engineer disciplines, information systems
# data
# Share of respondents per undergrad major among those who reported a major.
# Note: the `Percent` column holds the raw count; `pct` is the proportion.
major.withoutna <- data %>%
  group_by(UndergradMajor) %>%
  filter(!is.na(UndergradMajor)) %>%
  summarise(Percent = length(UndergradMajor)) %>%
  mutate(pct = prop.table(Percent)) %>%
  ungroup()

# plot
# Lollipop chart of the shares, one row per major.
ggplot(data = major.withoutna,
       aes(x = reorder(UndergradMajor, -pct),
           y = pct)) +
  geom_point(color = "#9632B8FF", show.legend = FALSE, size = 4) +
  geom_segment(aes(x = UndergradMajor,
                   xend = UndergradMajor,
                   y = pct,
                   yend = 0),
               size = 1,
               color = "#9632B8FF") +
  # sprintf() formats each label independently; format(x, digit = 1) relied on
  # partial matching of `digits` and padded labels to a common width.
  geom_label(aes(label = sprintf("%.1f%%", pct * 100)),
             hjust = "inward",
             size = 3,
             color = "#9632B8FF") +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  theme +
  labs(title = "Undergrad Majors without NAs",
       x = NULL,
       y = NULL)

3.2.3 Member VS Participation by Undergrad Major

  • People studying natural science are amongst the group that participates the most in Stack Overflow
  • People who study computer science feel that they belong more to the community, also the biggest group in this community
# data
# Participation frequencies that count as "participates" below.
temp <- c("Multiple times per day",
          "Daily or almost daily",
          "A few times per week",
          "A few times per month or weekly")

# Per undergrad major: share that participates (Part), share answering "Yes"
# to considering themselves a member (Memb), and number of respondents (Count).
under.grad <- data %>%
  filter(!is.na(UndergradMajor)) %>%
  group_by(UndergradMajor) %>%
  summarise(
    Part = mean(StackOverflowParticipate %in% temp, na.rm = TRUE),
    Memb = mean(StackOverflowConsiderMember == "Yes", na.rm = TRUE),
    Count = n()
  )

# plot
# Scatter of membership share against participation share with a linear fit;
# point size is the number of respondents in the major.
ggplot(under.grad, aes(x = Part, y = Memb)) +
  geom_smooth(method = "lm", size = 1.5, color = "#D43F3AFF") +
  geom_text_repel(aes(label = UndergradMajor),
                  family = "IBMPlexSans",
                  point.padding = 0.1,
                  size = 3) +
  geom_point(aes(size = Count), color = "#EEA236FF", alpha = 0.8) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_size_continuous(labels = comma_format()) +
  theme +
  theme(legend.position = "right") +
  labs(x = "% of Participation",
       y = "% of Who Believes Is A Member",
       title = "Member VS Participation by Undergrad Major")

3.3 Gender

3.3.1 Salary by Gender in General

  • Males tend to have a higher presence compared to female
  • Both salary structures are the same in males and females, but males tend to have a higher ceiling
# data
# Male and female respondents with a reported salary, keeping only salaries at
# or below the 95th percentile of that subset.
box.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(ConvertedSalary)) %>%
  filter(ConvertedSalary <= quantile(ConvertedSalary, probs = 0.95))

# plot
# Per gender: half violin on the left, half boxplot on the right.
options(scipen = 999)  # print plain numbers instead of scientific notation
ggplot(box.gender,
       aes(x = Gender, y = ConvertedSalary, color = Gender, fill = Gender)) +
  geom_half_violin(side = "l", trim = FALSE, alpha = 0.5) +
  geom_half_boxplot(side = "r", width = 0.7, outlier.size = 2, alpha = 0.5) +
  scale_color_locuszoom() +
  scale_fill_locuszoom() +
  theme +
  labs(x = NULL,
       y = "Salary",
       title = "Salary by Gender")

3.3.2 Average Salary by Gender in Countries

  • Males are paid more in most of the countries
  • Females are paid significantly more only in Cyprus
# data
# Mean salary per gender and country, keeping for each gender only the
# countries whose mean lies above that gender's 75th percentile.
sal.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(ConvertedSalary)) %>%
  group_by(Gender, Country) %>%
  summarise(MS = mean(ConvertedSalary)) %>%
  group_by(Gender) %>%
  filter(MS > quantile(MS, probs = 0.75)) %>%
  ungroup()

# plot
# One point per gender/country pair; point size is the mean salary.
ggplot(sal.gender) +
  geom_point(aes(x = Gender, y = Country, size = MS, color = Gender)) +
  scale_color_locuszoom() +
  theme +
  labs(x = NULL,
       y = NULL,
       title = "Salary by Gender")

3.3.3 Salary Distribution by Gender in Countries

  • For countries with a medium number of respondents (between 100 and 500), the salary range becomes wider and shifts lower
# data
# Full-time employed male and female respondents, restricted to countries with
# more than 100 and fewer than 500 such respondents.
dist.sal.gen <- data %>%
  filter(Employment == "Employed full-time",
         Gender %in% c("Male", "Female"),
         !is.na(Country)) %>%
  add_count(Country, name = "Count") %>%
  filter(Count > 100 & Count < 500)

# Those same countries ordered by descending median salary; used below to fix
# the order of the country axis.
con.sal.gen <- dist.sal.gen %>%
  group_by(Country) %>%
  summarise(m.sal = median(ConvertedSalary, na.rm = TRUE)) %>%
  arrange(desc(m.sal)) %>%
  transmute(Country = factor(Country))

# plot
# Boxplots of converted salary per country, split by gender, on a log10 axis.
options(scipen = 999)  # print plain numbers instead of scientific notation
ggplot(dist.sal.gen, aes(x = Country, y = ConvertedSalary, fill = Gender)) +
  geom_boxplot() +
  scale_x_discrete(limits = con.sal.gen$Country) +
  scale_y_log10() +
  scale_fill_locuszoom() +
  coord_flip() +
  theme +
  theme(legend.position = "right") +
  labs(x = NULL,
       y = "Log Converted Salary",
       title = "Salary Distribution by Gender in Countries")

3.3.4 Formal Education by Gender

  • Females tend to have more formal education, such as bachelor’s and master’s degree, than males
# data
# For each gender, the share of respondents at each formal education level.
edu.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(FormalEducation)) %>%
  count(Gender, FormalEducation, name = "Count") %>%
  group_by(Gender) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()

# plot
# Dodged horizontal bars per education level; long level names are wrapped.
ggplot(edu.gender,
       aes(x = reorder(FormalEducation, pct), y = pct, fill = Gender)) +
  geom_col(position = "dodge") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 45)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_locuszoom() +
  coord_flip() +
  theme +
  theme(legend.position = "right") +
  labs(x = NULL,
       y = NULL,
       title = "Formal Education by Gender")

3.3.5 Hobby by Gender

  • Males code as a hobby more than females
# data
# For each gender, the share of respondents per answer to the Hobby question.
hob.gender <- data %>%
  filter(Gender == "Male" | Gender == "Female") %>%
  filter(!is.na(Hobby)) %>%
  group_by(Gender, Hobby) %>%
  summarise(Count = n()) %>%
  # Still grouped by Gender here, so the shares sum to 1 within each gender.
  mutate(pct = prop.table(Count)) %>%
  ungroup()

# plot
# One facet per gender with a labelled bar for each Hobby answer.
ggplot(data = hob.gender,
       aes(x = reorder(Hobby, -pct),
           y = pct,
           fill = Gender)) +
  geom_bar(stat = "identity") +
  # sprintf() formats each label independently; format(x, digit = 3) relied on
  # partial matching of `digits` and padded labels to a common width.
  geom_label(aes(label = sprintf("%.1f%%", pct * 100)),
             size = 4,
             fill = "white") +
  facet_wrap(~Gender) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Hobby by Gender",
       x = "Codes as a Hobby",
       y = NULL)

3.3.6 Student Status by Gender

  • Most respondents are not students
  • No gender distinction is in student status
# data
# For each gender, the share of respondents per student status.
stu.gender <- data %>%
  filter(Gender == "Male" | Gender == "Female") %>%
  filter(!is.na(Student)) %>%
  group_by(Gender, Student) %>%
  summarise(Count = n()) %>%
  # Still grouped by Gender here, so the shares sum to 1 within each gender.
  mutate(pct = prop.table(Count)) %>%
  ungroup()

# plot
# One facet per gender with a labelled bar for each student status.
ggplot(data = stu.gender,
       aes(x = reorder(Student, -pct),
           y = pct,
           fill = Gender)) +
  geom_bar(stat = "identity") +
  # sprintf() formats each label independently; format(x, digit = 2) relied on
  # partial matching of `digits` and padded labels to a common width.
  geom_label(aes(label = sprintf("%.1f%%", pct * 100)),
             size = 4,
             fill = "white") +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~Gender) +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Student Status by Gender",
       x = "Student Status",
       y = NULL)

3.3.7 Age by Gender

  • Most respondents are young from 10’s, 20’s and 30’s
  • Females tend to have a younger group than males
# data
# For each gender, the share of respondents per age bracket.
age.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(Age)) %>%
  count(Gender, Age, name = "Count") %>%
  group_by(Gender) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()

# plot
# Dodged horizontal bars per age bracket, brackets ordered by share.
ggplot(age.gender, aes(x = reorder(Age, pct), y = pct, fill = Gender)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_locuszoom() +
  coord_flip() +
  theme +
  theme(legend.position = "right") +
  labs(x = NULL,
       y = NULL,
       title = "Age by Gender")

3.3.8 Last Job by Gender

  • Females are more likely than males to have started their last new job less than a year ago
  • Males are more likely than females to have started their last new job more than 4 years ago
# data
# For each gender, the share of respondents per LastNewJob answer.
job.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(LastNewJob)) %>%
  count(Gender, LastNewJob, name = "Count") %>%
  group_by(Gender) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()

# plot
# Dodged horizontal bars per answer, answers ordered by share.
ggplot(job.gender, aes(x = reorder(LastNewJob, pct), y = pct, fill = Gender)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_locuszoom() +
  coord_flip() +
  theme +
  theme(legend.position = "right") +
  labs(x = NULL,
       y = NULL,
       title = "Last Job by Gender")

3.3.9 Top Current Languages by Gender

  • Top 3 languages used by both genders are Javascript, HTML, and CSS
  • The top 3 languages are all used for web development
# data
# Builds one row per (Gender, language) with the number of mentions (Count) and
# the language's share of all language mentions within that gender (Percent).
cur.lang = data %>%
  filter(Gender == "Male" | Gender == "Female") %>% 
  filter(!is.na(LanguageWorkedWith)) %>% 
  # LanguageWorkedWith holds several languages separated by ";"; split it into
  # a list column and expand to one row per respondent/language pair.
  mutate(LanguageWorkedWith = str_split(LanguageWorkedWith,
                                        pattern = ";")) %>% 
  select(Gender, LanguageWorkedWith) %>%
  unnest(LanguageWorkedWith) %>% 
  group_by(Gender, LanguageWorkedWith) %>% 
  summarise(Count = n()) %>% 
  # Still grouped by Gender here, so Percent sums to 100 within each gender.
  mutate(Percent = prop.table(Count)*100) %>%
  mutate(Percent = round(Percent, 1)) %>% 
  arrange(desc(Count)) %>% 
  # Converts the language column to a factor ordered by Count.
  # NOTE(review): this mutate() still runs per Gender group, so the resulting
  # level order across genders should be confirmed.
  mutate(LanguageWorkedWith = reorder(LanguageWorkedWith, Count)) %>% 
  ungroup()

# Per-gender tables and the label table, each sorted by descending factor level
# of the language so that rows line up by language.
male = cur.lang %>% 
  filter(Gender == "Male") %>%
  arrange(desc(LanguageWorkedWith))
female = cur.lang %>% 
  filter(Gender == "Female") %>% 
  arrange(desc(LanguageWorkedWith))
lang.labels = cur.lang %>% 
  arrange(desc(LanguageWorkedWith))

# Colour ramps for the left (male) and right (female) bars.
mfunc = colorRampPalette(c("red", "orange", "blue"))
ffunc = colorRampPalette(c("pink", "purple", "yellow"))

# plot
# Pyramid plot with male percentages on the left and female on the right.
# NOTE(review): male$Percent, female$Percent and the labels are paired purely by
# position; this presumably assumes both genders report exactly the same set of
# languages - confirm against the data.
pyramid.plot(male$Percent,
             female$Percent,
             labels = unique(lang.labels$LanguageWorkedWith),
             top.labels = c("Male", "", "Female"),
             main = "Top Current Languages by Gender",
             gap = 5,
             show.values = T,
             lxcol = mfunc(nrow(male)),
             rxcol = ffunc(nrow(female)))

3.3.10 Top Current Languages by Gender in Wordcloud

  • This is part of the male section
# Word cloud of the languages whose mention count among male respondents is
# above the male median; the widget is saved as HTML and captured as a PNG.
wc.m <- cur.lang %>%
  filter(Gender == "Male") %>%
  # Separate filter on purpose: the quantile is taken over the male rows only.
  filter(Count > quantile(Count, probs = 0.5)) %>%
  select(LanguageWorkedWith, Count) %>%
  wordcloud2(size = 0.5)
saveWidget(widget = wc.m, file = "m.html", selfcontained = FALSE)
webshot(url = "m.html", file = "m.png", vwidth = 700, vheight = 500, delay = 5)

  • This is part of the female section
# Word cloud of the languages whose mention count among female respondents is
# above the female median; the widget is saved as HTML and captured as a PNG.
wc.f <- cur.lang %>%
  filter(Gender == "Female") %>%
  # Separate filter on purpose: the quantile is taken over the female rows only.
  filter(Count > quantile(Count, probs = 0.5)) %>%
  select(LanguageWorkedWith, Count) %>%
  wordcloud2(size = 0.5)
saveWidget(widget = wc.f, file = "f.html", selfcontained = FALSE)
webshot(url = "f.html", file = "f.png", vwidth = 700, vheight = 500, delay = 5)

3.4 Language

3.4.1 Python VS R for Language Preference

  • Top 5 countries with most respondents use more Python than R
  • US and Germany use R more considerably
  • US, India, and UK use Python more considerably
# data
# Number of respondents per country who list Python or R among the languages
# they work with, keeping the 30 largest country/language counts.
# LanguageWorkedWith is a ";"-separated multi-value field, so it has to be split
# before matching; comparing the raw string with == "Python" / == "R" only
# matched respondents who listed exactly one language.
lang.pref <- data %>%
  filter(!is.na(Country), !is.na(LanguageWorkedWith)) %>%
  select(Country, LanguageWorkedWith) %>%
  mutate(LanguageWorkedWith = str_split(LanguageWorkedWith,
                                        pattern = ";")) %>%
  unnest(LanguageWorkedWith) %>%
  filter(LanguageWorkedWith %in% c("Python", "R")) %>%
  count(Country, LanguageWorkedWith, name = "Count") %>%
  arrange(desc(Count)) %>%
  head(30)

# plot
# One facet per language with horizontal bars of the counts per country.
ggplot(data = lang.pref,
       aes(x = reorder(Country, Count),
           y = Count,
           fill = LanguageWorkedWith)) +
  geom_bar(stat = "identity") +
  facet_wrap(~LanguageWorkedWith) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Python VS R for Language Preference",
       x = NULL,
       y = "Count")

3.5 Employment

3.5.1 Pie Chart by Employment

  • Most respondents, almost 75% of them, are employed full-time
# data
# Number and percentage of respondents per employment status (NAs excluded).
emp.pie <- data %>%
  filter(!is.na(Employment)) %>%
  count(Employment, name = "Count") %>%
  mutate(pct = Count / sum(Count) * 100)

# plot
# A single stacked bar drawn in polar coordinates, i.e. a pie chart; axis
# lines, tick labels, the panel border and the grid are removed.
ggplot(emp.pie, aes(x = "", y = pct, fill = Employment)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y", start = 0) +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right",
        plot.title = element_text(hjust = 0.6),
        axis.line = element_blank(),
        axis.text.x = element_blank(),
        panel.border = element_blank(),
        panel.grid = element_blank()) +
  labs(x = NULL,
       y = NULL,
       title = "Pie Chart by Employment")

3.6 Race

3.6.1 Salary by Race Ethnicity

  • Respondents of American, European, and Oceanian descent are paid above average
  • Respondents of Asian, African, and Hispanic descent are paid below average
# data
# Mean salary per race/ethnicity value, standardized across the groups and
# flagged as above or below the average of the group means.
sal.race <- data %>%
  filter(!is.na(RaceEthnicity)) %>%
  filter(!is.na(ConvertedSalary)) %>%
  select(RaceEthnicity, ConvertedSalary) %>%
  # Split multi-valued answers on "," and then on ";" so that every individual
  # value becomes its own row.
  mutate(RaceEthnicity = str_split(RaceEthnicity, pattern = ",")) %>%
  unnest(RaceEthnicity) %>%
  mutate(RaceEthnicity = str_split(RaceEthnicity, pattern = ";")) %>%
  unnest(RaceEthnicity) %>%
  group_by(RaceEthnicity) %>%
  summarise(avg = mean(ConvertedSalary)) %>%
  mutate(per = standardize(avg)) %>%
  mutate(per.type = ifelse(per < 0,
                           "Below Average",
                           "Above Average")) %>%
  # Relabel each group by matching on its content. The previous code overwrote
  # the column with a hard-coded 9-element vector, which silently mislabels the
  # bars whenever the sort order of the groups differs (e.g. another locale).
  # NOTE(review): the keywords are taken from the display labels themselves;
  # confirm they match the raw survey values. Unmatched values keep their
  # (trimmed) original text.
  mutate(RaceEthnicity = case_when(
    str_detect(RaceEthnicity, "Indigenous Australian") ~ "Indigenous Australian",
    str_detect(RaceEthnicity, "Pacific Islander") ~ "Pacific Islander",
    str_detect(RaceEthnicity, "African") ~ "Black African",
    str_detect(RaceEthnicity, "East Asian") ~ "East Asian",
    str_detect(RaceEthnicity, "Hispanic") ~ "Hispanic",
    str_detect(RaceEthnicity, "Middle Eastern") ~ "Middle Eastern",
    str_detect(RaceEthnicity, "Native American") ~ "Native American",
    str_detect(RaceEthnicity, "South Asian") ~ "South Asian",
    str_detect(RaceEthnicity, "European") ~ "White European",
    TRUE ~ str_trim(RaceEthnicity)
  ))

# plot
# Diverging horizontal bars of the standardized mean salary, coloured by
# whether the group is above or below average.
ggplot(data = sal.race,
       aes(x = reorder(RaceEthnicity, per),
           y = per)) +
  geom_bar(stat = "identity",
           aes(fill = per.type),
           width = 0.6) +
  coord_flip() +
  scale_fill_locuszoom(name = "Salary",
                       labels = c("Above Avg",
                                  "Below Avg")) +
  theme +
  theme(legend.position = "right") +
  labs(title = "Salary by Race Ethnicity",
       x = NULL,
       y = "Standardized Salary")

3.7 Company

3.7.1 Company Size for Respondents

  • Medium size companies have the most respondents due to having needs and requiring DIY help, which is almost half of them
  • Large size companies might have their own departments to solve the problem
  • Small size companies might not need a lot of programming on their business
# data
# Number and percentage of respondents per company size (NAs excluded).
comp.size <- data %>%
  filter(!is.na(CompanySize)) %>%
  group_by(CompanySize) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ungroup()

# plot
# Horizontal bars of the counts, each labelled with its percentage.
ggplot(data = comp.size,
       aes(x = reorder(CompanySize, Count),
           y = Count)) +
  geom_bar(stat = "identity",
           show.legend = FALSE,
           fill = "#5CB85CFF") +
  # sprintf() formats each label independently; format(x, digit = 2) relied on
  # partial matching of `digits` and padded labels to a common width.
  geom_label(aes(label = sprintf("%.1f%%", pct)),
             size = 4,
             color = "#5CB85CFF") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  coord_flip() +
  theme +
  labs(title = "Company Size for Respondents",
       x = NULL,
       y = "Count")

3.8 AI

3.8.1 Top Countries Perceptions to Artificial Intelligence

  • All top countries except India share the same main worry: algorithms making important decisions
  • Interestingly, the perceptions in the US and India are exact opposites
# data
# Count of each AIDangerous answer per country, keeping the 20 largest
# country/answer combinations.
con.aid <- data %>%
  filter(!is.na(AIDangerous)) %>%
  count(Country, AIDangerous, name = "Count") %>%
  top_n(20, wt = Count)

# plot
# One facet per country with horizontal bars per answer; long answer texts are
# wrapped at 35 characters.
ggplot(con.aid, aes(x = reorder(AIDangerous, -Count), y = Count, fill = Country)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Country, scales = "free_x") +
  coord_flip() +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 35)) +
  scale_fill_locuszoom() +
  theme +
  labs(x = "AI Dangerous",
       y = "Count",
       title = "Top Countries Perceptions to Artificial Intelligence")

4 Conclusion

This analysis can serve as a template for exploring survey datasets in the future. The idea is to choose a first factor as the primary dimension, then add a second or third factor to slice and dice the plot. We can either discover the question after plotting or pose the question before plotting. By repeating this process with different factors, we can explore the data systematically. Survey analysis of this kind does not require heavy statistical or mathematical analysis; it only needs simple plots built from well-chosen factors to bring out insights, some of which are predictable and some novel.

5 Reference

  1. Developer Survey / 2018 / Stack Overflow
  2. Survey Analysis / 2018 / Janio Martinez Bachmann
  3. GGSCI / 2020 / SCI666