1 Objective

This is an analysis of the Stack Overflow survey dataset

2 Preparation

2.1 Environment

Let us set up the working environment and be ready for the analysis

# theme
# Shared look for every chart: theme_bw() base, bold 15pt title, 10pt subtitle
# and axis titles, unrotated x tick labels, and no legend unless a plot turns
# it back on. The object shares its name with the theme() function; calls such
# as theme(legend.position = "right") still resolve to the function.
theme <- theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 10),
    axis.text.x = element_text(angle = 0),
    legend.position = "none"
  )

# color
# Reference list of the hex codes that appear as literal fill/colour values
# in the plots below.
# RD = #D43F3AFF
# OG = #EEA236FF
# GN = #5CB85CFF
# LB = #46B8DAFF
# DB = #357EBDFF
# PR = #9632B8FF
# GY = #B8B8B8FF

2.2 Dataset

The dataset has 129 columns and 98,855 rows

# Load the survey responses; every later section reads from `data`.
data <- read_csv(file = "DATA.csv")

3 EDA for Business Insight

3.1 Country

3.1.1 Treemap from Top 20 Countries with Most Respondents

The top 5 countries are, in order, the US, India, Germany, the UK, and Canada

# data
# Respondents per country plus each country's share of all rows (rows with a
# missing Country form their own group), reduced to the 20 largest groups.
# top_n() keeps ties on Number.
top.countries <- data %>%
  count(Country, name = "Number") %>%
  mutate(Perc = Number / sum(Number)) %>%
  top_n(20, wt = Number)

# plot
# Treemap: tile area follows the respondent count, one fill colour per country.
ggplot(top.countries,
       aes(area = Number, fill = factor(Country), label = Country)) +
  geom_treemap(show.legend = FALSE) +
  geom_treemap_text(place = "centre",
                    color = "black",
                    fontface = "italic") +
  scale_fill_d3(palette = "category20", alpha = 0.5) +
  labs(title = "Treemap from Top 20 Countries with Most Respondents",
       x = NULL,
       y = NULL)

3.1.2 Amount and Percent of Respondents by Country

US (21%), India (14%), Germany (7%), UK (6%), Canada (3%)
Together, the top 5 countries account for 51% of respondents, i.e. just over half of them

# plot
# Horizontal bars of respondent counts (largest on top), each labelled with
# the country's percentage of all respondents.
ggplot(top.countries, aes(x = reorder(Country, Number), y = Number)) +
  geom_col(fill = "#5CB85CFF") +
  geom_text(aes(label = paste0(format(Perc * 100, digits = 2), "%")),
            color = "black",
            size = 2.5,
            hjust = -0.05,
            vjust = 0.3) +
  coord_flip() +
  theme +
  labs(title = "Amount and Percent of Respondents",
       x = NULL,
       y = "Count")

3.1.3 Distribution of Salary by Country

The US, Switzerland, and Israel offer the highest median salaries
India and China tend to show a wider spread of salaries

# data
# Full-time employees with a known country, restricted to countries that have
# more than 500 such respondents. Count holds that per-country row count.
dist.sal <- data %>%
  filter(Employment == "Employed full-time", !is.na(Country)) %>%
  add_count(Country, name = "Count") %>%
  filter(Count > 500)

# Same countries, ordered from highest to lowest median salary; only used to
# fix the order of the x axis in the plot below.
con.sal <- dist.sal %>%
  group_by(Country) %>%
  summarise(m.sal = median(ConvertedSalary, na.rm = TRUE)) %>%
  arrange(desc(m.sal)) %>%
  select(Country) %>%
  mutate(Country = factor(Country))

# plot
# Violin per country on a log10 salary axis; scipen = 999 keeps axis labels
# out of scientific notation (this option stays set for the rest of the run).
options(scipen = 999)
ggplot(dist.sal, aes(x = Country, y = ConvertedSalary)) +
  geom_violin(fill = "#46B8DAFF") +
  scale_x_discrete(limits = con.sal$Country) +
  scale_y_log10() +
  coord_flip() +
  theme +
  labs(title = "Distribution of Salary",
       x = NULL,
       y = "Log Converted Salary")

3.1.4 Employment Rate from Top Surveyed Countries

Switzerland and South Africa are amongst the highest in employment

# data
# Employment statuses that count as "employed" for the rate below.
temp <- c(
  "Employed part-time",
  "Employed full-time",
  "Independent contractor, freelancer, or self-employed"
)

# Share of respondents per country whose Employment is one of `temp`, kept
# for countries with more than 500 respondents. %in% yields FALSE for a
# missing Employment, so those rows count as not employed and na.rm has
# nothing to remove.
emp.con <- data %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(
    avg.emp = mean(Employment %in% temp, na.rm = TRUE),
    Count = n()
  ) %>%
  filter(Count > 500)

# plot
# Lollipop chart: dot at the employment rate, stem down to zero, rate label.
ggplot(emp.con, aes(x = reorder(Country, avg.emp), y = avg.emp)) +
  geom_point(color = "#357EBDFF", size = 5) +
  geom_segment(aes(x = Country, xend = Country, y = 0, yend = avg.emp),
               color = "#357EBDFF") +
  geom_label(aes(label = paste0(format(avg.emp * 100, digits = 3), "%")),
             color = "#357EBDFF",
             size = 3,
             hjust = "inward") +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  theme +
  labs(title = "Employment Rate from Top Surveyed Countries",
       x = NULL,
       y = "Employment Rate")

3.1.5 Female to Male Ratio by Country

Only countries with more than 100 male and female respondents combined are analyzed
The best female-to-male ratio is 16.6%, which means roughly 17 women work as developers for every 100 men in that country
The worst ratio is 1.9%, which means about 2 women work as developers for every 100 men

# data
# Female-to-male ratio per country, computed once and reused for both panels.
# Female/Male are respondent counts; N is their sum. Countries where either
# count is missing get NA for N and are dropped by the N > 100 filter.
fm.all <- data %>%
  filter(!is.na(Country), Gender %in% c("Male", "Female")) %>%
  count(Country, Gender, name = "Count") %>%
  spread(Gender, Count) %>%
  mutate(F2M = Female / Male,
         N = Female + Male) %>%
  filter(N > 100)

# Ten countries with the highest ratio.
fm.r <- fm.all %>%
  arrange(desc(F2M)) %>%
  head(10)

# plot
p.1 <- ggplot(fm.r, aes(x = reorder(Country, F2M), y = F2M)) +
  geom_col(fill = "#9632B8FF", alpha = 0.8, width = 0.5) +
  geom_label(aes(label = paste0(format(F2M * 100, digits = 2), "%")),
             color = "#9632B8FF",
             size = 4,
             hjust = "inward") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Better F2M",
       x = NULL,
       y = "Female to Male Ratio")

# data
# Ten countries with the lowest ratio (fm.r is overwritten on purpose; p.1
# already holds its own copy of the previous table).
fm.r <- fm.all %>%
  arrange(F2M) %>%
  head(10)

# plot
p.2 <- ggplot(fm.r, aes(x = reorder(Country, -F2M), y = F2M)) +
  geom_col(fill = "#9632B8FF", alpha = 0.8, width = 0.5) +
  geom_label(aes(label = paste0(format(F2M * 100, digits = 2), "%")),
             color = "#9632B8FF",
             size = 4,
             hjust = "inward") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Worse F2M",
       x = NULL,
       y = "Female to Male Ratio")

# plot
# Both panels side by side.
ggarrange(p.1, p.2, nrow = 1)

3.2 Major

3.2.1 Undergrad Majors with NAs

NAs account for about 20% of the undergrad major responses

# data
# Share of all rows per undergrad major; missing majors stay in as their own
# NA group. Count holds a proportion (0-1), not a raw count.
major.withna <- data %>%
  count(UndergradMajor, name = "Count") %>%
  mutate(Count = Count / nrow(data))

# plot
# Lollipop chart of the shares, labelled in percent.
ggplot(major.withna, aes(x = reorder(UndergradMajor, -Count), y = Count)) +
  geom_point(color = "#357EBDFF", size = 4, show.legend = FALSE) +
  geom_segment(aes(x = UndergradMajor,
                   xend = UndergradMajor,
                   y = Count,
                   yend = 0),
               color = "#357EBDFF",
               size = 1) +
  geom_label(aes(label = paste0(format(Count * 100, digits = 1), "%")),
             color = "#357EBDFF",
             size = 3,
             hjust = "inward") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Undergrad Majors with NAs",
       x = NULL,
       y = NULL)

3.2.2 Undergrad Majors without NAs

The top 3 undergrad majors are computer science, other engineering disciplines, and information systems

# data
# Same breakdown with missing majors removed first. Note the column named
# Percent actually holds the raw row count; pct is the share of the
# remaining rows.
major.withoutna <- data %>%
  filter(!is.na(UndergradMajor)) %>%
  count(UndergradMajor, name = "Percent") %>%
  mutate(pct = Percent / sum(Percent))

# plot
# Lollipop chart of the shares, labelled in percent.
ggplot(major.withoutna, aes(x = reorder(UndergradMajor, -pct), y = pct)) +
  geom_point(color = "#9632B8FF", size = 4, show.legend = FALSE) +
  geom_segment(aes(x = UndergradMajor,
                   xend = UndergradMajor,
                   y = pct,
                   yend = 0),
               color = "#9632B8FF",
               size = 1) +
  geom_label(aes(label = paste0(format(pct * 100, digits = 1), "%")),
             color = "#9632B8FF",
             size = 3,
             hjust = "inward") +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  theme +
  labs(title = "Undergrad Majors without NAs",
       x = NULL,
       y = NULL)

3.2.3 Member VS Participation by Undergrad Major

People studying natural science are amongst the group that participates the most in Stack Overflow
People who study computer science feel that they belong more to the community, also the biggest group in this community

# data
# Participation frequencies that count as "participates".
temp <- c(
  "Multiple times per day",
  "Daily or almost daily",
  "A few times per week",
  "A few times per month or weekly"
)

# Per undergrad major: share that participates (Part), share that considers
# itself a member (Memb), and group size (Count). The two shares use different
# denominators: %in% treats a missing participation answer as FALSE, whereas
# the == comparison yields NA for a missing membership answer, which na.rm
# then excludes.
under.grad <- data %>%
  filter(!is.na(UndergradMajor)) %>%
  group_by(UndergradMajor) %>%
  summarise(
    Part = mean(StackOverflowParticipate %in% temp, na.rm = TRUE),
    Memb = mean(StackOverflowConsiderMember == "Yes", na.rm = TRUE),
    Count = n()
  )

# plot
# Scatter of membership vs participation with a linear fit; point size shows
# the number of respondents per major.
ggplot(under.grad, aes(x = Part, y = Memb)) +
  geom_smooth(method = "lm", color = "#D43F3AFF", size = 1.5) +
  geom_text_repel(aes(label = UndergradMajor),
                  family = "IBMPlexSans",
                  size = 3,
                  point.padding = 0.1) +
  geom_point(aes(size = Count), color = "#EEA236FF", alpha = 0.8) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_size_continuous(labels = comma_format()) +
  theme +
  theme(legend.position = "right") +
  labs(title = "Member VS Participation by Undergrad Major",
       x = "% of Participation",
       y = "% of Who Believes Is A Member")

3.3 Gender

3.3.1 Salary by Gender in General

There are many more male than female respondents
The salary distributions of males and females have a similar shape, but males tend to have a higher ceiling

# data
# Male/female respondents with a reported salary, trimmed at the 95th
# percentile. The percentile filter is a separate step so the quantile is
# computed only over the rows that survive the first filter.
# (Optional down-sampling for faster drawing: set.seed(323), then
#  %>% sample_n(1000) at the end of the pipeline.)
box.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(ConvertedSalary)) %>%
  filter(ConvertedSalary <= quantile(ConvertedSalary, probs = 0.95))

# plot
# Half violin on the left and half boxplot on the right for each gender.
# (A geom_jitter(alpha = 0.3, size = 2) layer can be added to show raw points.)
options(scipen = 999)
ggplot(box.gender,
       aes(x = Gender, y = ConvertedSalary, color = Gender, fill = Gender)) +
  geom_half_violin(side = "l", trim = FALSE, alpha = 0.5) +
  geom_half_boxplot(side = "r",
                    width = 0.7,
                    alpha = 0.5,
                    outlier.size = 2) +
  scale_color_locuszoom() +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Salary by Gender",
       x = NULL,
       y = "Salary")

3.3.2 Average Salary by Gender in Countries

Men are paid more than women in most of the countries
Women are paid significantly more than men only in Cyprus

# data
# Mean converted salary per gender and country. Rows without a salary or
# without a country are dropped first (a missing country would otherwise show
# up as its own row on the y axis). After summarise() the table is still
# grouped by Gender, so the quantile filter keeps, within each gender, the
# countries whose mean salary is above that gender's 75th percentile.
sal.gender <- data %>%
  filter(Gender %in% c("Male", "Female"),
         !is.na(ConvertedSalary),
         !is.na(Country)) %>%
  group_by(Gender, Country) %>%
  summarise(MS = mean(ConvertedSalary)) %>%
  filter(MS > quantile(MS, 0.75)) %>%
  ungroup()

# plot
# One point per (gender, country); point size follows the mean salary.
# The title now matches this section instead of repeating the 3.3.1 title.
ggplot(data = sal.gender,
       aes(x = Gender,
           y = Country,
           size = MS)) +
  geom_point(aes(color = Gender)) +
  scale_color_locuszoom() +
  theme +
  labs(title = "Average Salary by Gender in Countries",
       x = NULL,
       y = NULL)

3.3.3 Salary Distribution by Gender in Countries

For countries with a medium number of respondents (between 100 and 500), we can see that the salary range becomes wider and shifts lower

# data
# Full-time male/female employees with a known country, restricted to
# countries with between 100 and 500 such respondents (exclusive bounds).
dist.sal.gen <- data %>%
  filter(Employment == "Employed full-time",
         Gender %in% c("Male", "Female"),
         !is.na(Country)) %>%
  add_count(Country, name = "Count") %>%
  filter(Count > 100 & Count < 500)

# Same countries ordered from highest to lowest median salary; only used to
# fix the order of the x axis in the plot below.
con.sal.gen <- dist.sal.gen %>%
  group_by(Country) %>%
  summarise(m.sal = median(ConvertedSalary, na.rm = TRUE)) %>%
  arrange(desc(m.sal)) %>%
  select(Country) %>%
  mutate(Country = factor(Country))

# plot
# Boxplots per country split by gender, on a log10 salary axis.
options(scipen = 999)
ggplot(dist.sal.gen, aes(x = Country, y = ConvertedSalary, fill = Gender)) +
  geom_boxplot() +
  scale_x_discrete(limits = con.sal.gen$Country) +
  scale_y_log10() +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right") +
  labs(title = "Salary Distribution by Gender in Countries",
       x = NULL,
       y = "Log Converted Salary")

3.3.4 Formal Education by Gender

Females tend to have more formal education, such as bachelor’s and master’s degree, than males

# data
# Count of each formal-education level per gender; pct is the share within
# that gender.
edu.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(FormalEducation)) %>%
  count(Gender, FormalEducation, name = "Count") %>%
  group_by(Gender) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()

# plot
# Dodged horizontal bars; long education labels are wrapped at 45 characters.
ggplot(edu.gender,
       aes(x = reorder(FormalEducation, pct), y = pct, fill = Gender)) +
  geom_col(position = "dodge") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 45)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right") +
  labs(title = "Formal Education by Gender",
       x = NULL,
       y = NULL)

3.3.5 Hobby by Gender

Males code as a hobby more than females

# data
# Count of each Hobby answer per gender; pct is the share within that gender.
hob.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(Hobby)) %>%
  count(Gender, Hobby, name = "Count") %>%
  group_by(Gender) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()

# plot
# One facet per gender, bars labelled with the within-gender percentage.
# (Facet strips and the legend can be toggled with
#  theme(strip.text.x = element_blank()) / theme(legend.position = "right").)
ggplot(hob.gender, aes(x = reorder(Hobby, -pct), y = pct, fill = Gender)) +
  geom_col() +
  geom_label(aes(label = paste0(format(pct * 100, digits = 3), "%")),
             fill = "white",
             size = 4) +
  facet_wrap(~Gender) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Hobby by Gender",
       x = "Codes as a Hobby",
       y = NULL)

3.3.6 Student Status by Gender

Most respondents are not students
No gender distinction is in student status

# data
# Count of each Student answer per gender; pct is the share within that
# gender.
stu.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(Student)) %>%
  count(Gender, Student, name = "Count") %>%
  group_by(Gender) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()

# plot
# One facet per gender, bars labelled with the within-gender percentage.
ggplot(stu.gender, aes(x = reorder(Student, -pct), y = pct, fill = Gender)) +
  geom_col() +
  geom_label(aes(label = paste0(format(pct * 100, digits = 2), "%")),
             fill = "white",
             size = 4) +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~Gender) +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Student Status by Gender",
       x = "Student Status",
       y = NULL)

3.3.7 Age by Gender

Most respondents are young: in their teens, 20s, and 30s
Female respondents tend to be younger than male respondents

# data
# Count of each Age answer per gender; pct is the share within that gender.
age.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(Age)) %>%
  count(Gender, Age, name = "Count") %>%
  group_by(Gender) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()

# plot
# Dodged horizontal bars; the Age axis is ordered by share, not by age.
ggplot(age.gender, aes(x = reorder(Age, pct), y = pct, fill = Gender)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right") +
  labs(title = "Age by Gender",
       x = NULL,
       y = NULL)

3.3.8 Last Job by Gender

Compared to males, more females started their last new job less than a year ago
Compared to females, more males started their last new job more than 4 years ago

# data
# Count of each LastNewJob answer per gender; pct is the share within that
# gender.
job.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(LastNewJob)) %>%
  count(Gender, LastNewJob, name = "Count") %>%
  group_by(Gender) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()

# plot
# Dodged horizontal bars ordered by share.
ggplot(job.gender,
       aes(x = reorder(LastNewJob, pct), y = pct, fill = Gender)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right") +
  labs(title = "Last Job by Gender",
       x = NULL,
       y = NULL)

3.3.9 Top Current Languages by Gender

The top 3 languages used by both genders are JavaScript, HTML, and CSS
All of the top 3 languages are used for web development

# data
# One row per (gender, language). LanguageWorkedWith is split on ";" and
# unnested, so a respondent contributes one row for every language listed.
# Percent is computed right after summarise(); presumably the table is still
# grouped by Gender at that point (summarise drops only the last grouping
# level by default), which would make Percent the share of that gender's
# language mentions, rounded to one decimal. TODO confirm.
# NOTE(review): the reorder() call also runs before ungroup(), so it is
# presumably evaluated per gender. Confirm the resulting factor level order
# is the one the pyramid plot below should use.
cur.lang = data %>%
  filter(Gender == "Male" | Gender == "Female") %>% 
  filter(!is.na(LanguageWorkedWith)) %>% 
  mutate(LanguageWorkedWith = str_split(LanguageWorkedWith,
                                        pattern = ";")) %>% 
  select(Gender, LanguageWorkedWith) %>%
  unnest(LanguageWorkedWith) %>% 
  group_by(Gender, LanguageWorkedWith) %>% 
  summarise(Count = n()) %>% 
  mutate(Percent = prop.table(Count)*100) %>%
  mutate(Percent = round(Percent, 1)) %>% 
  arrange(desc(Count)) %>% 
  mutate(LanguageWorkedWith = reorder(LanguageWorkedWith, Count)) %>% 
  ungroup()

# Per-gender tables and the label table, all sorted by descending factor
# level of LanguageWorkedWith.
# NOTE(review): the plot pairs male$Percent, female$Percent and the labels by
# position, so this assumes both genders contain exactly the same languages.
# Verify against the data.
male = cur.lang %>% 
  filter(Gender == "Male") %>%
  arrange(desc(LanguageWorkedWith))
female = cur.lang %>% 
  filter(Gender == "Female") %>% 
  arrange(desc(LanguageWorkedWith))
lang.labels = cur.lang %>% 
  arrange(desc(LanguageWorkedWith))

# Colour ramp generators; each is called below with the number of rows so
# that every bar gets its own colour.
mfunc = colorRampPalette(c("red", "orange", "blue"))
ffunc = colorRampPalette(c("pink", "purple", "yellow"))

# plot
# Pyramid chart: male percentages are passed first and female second, matching
# the "Male" / "Female" entries of top.labels; labels are the distinct
# languages in lang.labels order.
pyramid.plot(male$Percent,
             female$Percent,
             labels = unique(lang.labels$LanguageWorkedWith),
             top.labels = c("Male", "", "Female"),
             main = "Top Current Languages by Gender",
             gap = 5,
             show.values = T,
             lxcol = mfunc(nrow(male)),
             rxcol = ffunc(nrow(female)))

3.3.10 Top Current Languages by Gender in Wordcloud

This is part of the male section

# Word cloud of the languages used by male respondents. The gender filter
# runs first so the median is taken over male rows only; languages at or
# below that median count are dropped.
wc.m <- cur.lang %>%
  filter(Gender == "Male") %>%
  filter(Count > quantile(Count, probs = 0.5)) %>%
  select(LanguageWorkedWith, Count) %>%
  wordcloud2(size = 0.5)
# Save the widget as HTML, then capture that page as a PNG image.
saveWidget(wc.m, file = "m.html", selfcontained = FALSE)
webshot(url = "m.html", file = "m.png", vwidth = 700, vheight = 500, delay = 5)

This is part of the female section

# Word cloud of the languages used by female respondents. The gender filter
# runs first so the median is taken over female rows only; languages at or
# below that median count are dropped.
wc.f <- cur.lang %>%
  filter(Gender == "Female") %>%
  filter(Count > quantile(Count, probs = 0.5)) %>%
  select(LanguageWorkedWith, Count) %>%
  wordcloud2(size = 0.5)
# Save the widget as HTML, then capture that page as a PNG image.
saveWidget(wc.f, file = "f.html", selfcontained = FALSE)
webshot(url = "f.html", file = "f.png", vwidth = 700, vheight = 500, delay = 5)

3.4 Language

3.4.1 Python VS R for Language Preference

Top 5 countries with most respondents use more Python than R
US and Germany use R more considerably
US, India, and UK use Python more considerably

# data
# Python and R users per country. LanguageWorkedWith is a ";"-separated list
# of every language a respondent works with, so it is split and unnested
# before filtering. Comparing the raw column with == "Python" / == "R" would
# only match respondents who listed exactly one language and miss everyone
# who uses Python or R alongside other languages.
# The 30 largest (country, language) rows are kept for the plot.
lang.pref <- data %>%
  filter(!is.na(Country), !is.na(LanguageWorkedWith)) %>%
  select(Country, LanguageWorkedWith) %>%
  mutate(LanguageWorkedWith = str_split(LanguageWorkedWith,
                                        pattern = ";")) %>%
  unnest(LanguageWorkedWith) %>%
  filter(LanguageWorkedWith %in% c("Python", "R")) %>%
  group_by(Country, LanguageWorkedWith) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  arrange(desc(Count)) %>%
  head(30)

# plot
# Horizontal bars of user counts per country, one facet per language.
ggplot(data = lang.pref,
       aes(x = reorder(Country, Count),
           y = Count,
           fill = LanguageWorkedWith)) +
  geom_bar(stat = "identity") +
  facet_wrap(~LanguageWorkedWith) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Python VS R for Language Preference",
       x = NULL,
       y = "Count")

3.5 Employment

3.5.1 Pie Chart by Employment

Most respondents, almost 75% of them, are employed full-time

# data
# Respondents per employment status (missing statuses removed); pct is the
# share of the remaining rows, already multiplied by 100.
emp.pie <- data %>%
  filter(!is.na(Employment)) %>%
  count(Employment, name = "Count") %>%
  mutate(pct = Count / sum(Count) * 100)

# plot
# A single stacked bar wrapped onto polar coordinates gives the pie; axis
# lines, tick labels, border and grid are removed for a clean look.
ggplot(emp.pie, aes(x = "", y = pct, fill = Employment)) +
  geom_col() +
  coord_polar(theta = "y", start = 0) +
  scale_fill_locuszoom() +
  theme +
  theme(
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    panel.border = element_blank(),
    panel.grid = element_blank(),
    legend.position = "right",
    plot.title = element_text(hjust = 0.6)
  ) +
  labs(title = "Pie Chart by Employment",
       x = NULL,
       y = NULL)

3.6 Race

3.6.1 Salary by Race Ethnicity

American, European, and Oceanian respondents are paid above average
Asian, African, and Hispanic respondents are paid below average

# data
# Mean converted salary per race/ethnicity label. RaceEthnicity is split on
# "," and then on ";" and unnested each time, so a respondent's salary is
# counted once for every piece their answer splits into.
# NOTE(review): the pieces are not trimmed after the "," split, so some labels
# may carry leading whitespace or connective words. Verify against the data.
# standardize() is not defined in this file; presumably it centres and scales
# avg so that negative values mean "below the mean of the group averages".
# TODO confirm which package provides it.
# per.type flags each label by the sign of the standardized value.
sal.race = data %>%
  filter(!is.na(RaceEthnicity)) %>% 
  filter(!is.na(ConvertedSalary)) %>% 
  select(RaceEthnicity, ConvertedSalary) %>%
  mutate(RaceEthnicity = str_split(RaceEthnicity, pattern = ",")) %>% 
  unnest(RaceEthnicity) %>% 
  mutate(RaceEthnicity = str_split(RaceEthnicity, pattern = ";")) %>% 
  unnest(RaceEthnicity) %>% 
  group_by(RaceEthnicity) %>%
  summarise(avg = mean(ConvertedSalary)) %>% 
  mutate(per = standardize(avg)) %>% 
  mutate(per.type = ifelse(per < 0,
                           "Below Average",
                           "Above Average"))

# Overwrite the labels with short display names, matched purely by row
# position.
# NOTE(review): this assumes summarise() returned exactly nine rows in this
# order; if the order differs the labels are silently attached to the wrong
# averages. Confirm against the actual output.
sal.race$RaceEthnicity = c("Indigenous Australian",
                           "Pacific Islander",
                           "Black African",
                           "East Asian",
                           "Hispanic",
                           "Middle Eastern",
                           "Native American",
                           "South Asian",
                           "White European")

# plot
# Diverging horizontal bars of the standardized mean salary, coloured by
# whether the value is above or below zero.
ggplot(sal.race, aes(x = reorder(RaceEthnicity, per), y = per)) +
  geom_col(aes(fill = per.type), width = 0.6) +
  coord_flip() +
  scale_fill_locuszoom(name = "Salary",
                       labels = c("Above Avg", "Below Avg")) +
  theme +
  theme(legend.position = "right") +
  labs(title = "Salary by Race Ethnicity",
       x = NULL,
       y = "Standardized Salary")

3.7 Company

3.7.1 Company Size for Respondents

Medium-sized companies account for the most respondents, almost half of them, possibly because they have programming needs that require do-it-yourself help
Large companies might have their own departments to solve such problems
Small companies might not need much programming in their business

# data
# Respondents per company size (missing sizes removed); pct is the share of
# the remaining rows, already multiplied by 100.
comp.size <- data %>%
  filter(!is.na(CompanySize)) %>%
  count(CompanySize, name = "Count") %>%
  mutate(pct = Count / sum(Count) * 100)

# plot
# Horizontal bars of counts labelled with the percentage; size labels are
# wrapped at 15 characters.
ggplot(comp.size, aes(x = reorder(CompanySize, Count), y = Count)) +
  geom_col(fill = "#5CB85CFF", show.legend = FALSE) +
  geom_label(aes(label = paste0(format(pct, digits = 2), "%")),
             color = "#5CB85CFF",
             size = 4) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  coord_flip() +
  theme +
  labs(title = "Company Size for Respondents",
       x = NULL,
       y = "Count")

3.8 AI

3.8.1 Top Countries Perceptions to Artificial Intelligence

All top countries except India share a similar view: algorithms making important decisions is the main worry
Interestingly, the perceptions in the US and India are exact opposites

# data
# Respondents per (country, AIDangerous answer), then the 20 largest of those
# rows. The cut is made on rows rather than on countries, so a country can
# appear with only some of its answers.
con.aid <- data %>%
  filter(!is.na(AIDangerous)) %>%
  count(Country, AIDangerous, name = "Count") %>%
  top_n(20, wt = Count)

# plot
# One facet per country with horizontal bars per answer; answer labels are
# wrapped at 35 characters.
ggplot(con.aid,
       aes(x = reorder(AIDangerous, -Count), y = Count, fill = Country)) +
  geom_col() +
  facet_wrap(~Country, scales = "free_x") +
  coord_flip() +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 35)) +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Top Countries Perceptions to Artificial Intelligence",
       x = "AI Dangerous",
       y = "Count")

4 Conclusion

This analysis can serve as a template for exploring survey datasets in the future. The idea is to set the goal as the first dimension, i.e. choose a first factor, and then add a second or third factor to stack up and slice the plot. We can either discover the question after plotting or ask the question before plotting. By repeating this process with different factors, we are able to explore the data. Survey analysis of this kind does not need statistical or mathematical modelling. It just needs simple plots, built by choosing factors, to bring out insights, some of which are predictable and some of which are novel.

Survey Analysis

Chun-Li Hou

June 28, 2021