# theme
# Shared plot theme: black-and-white base with a bold size-15 title, size-10
# subtitle and axis titles, unrotated x-axis text, and the legend hidden
# (individual plots re-enable it with `theme(legend.position = "right")`).
# NOTE: this object shares its name with ggplot2's `theme()` function; calls
# such as `theme(...)` still work because R skips non-function bindings when
# resolving a call.
theme <- theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 10),
    axis.text.x = element_text(angle = 0),
    legend.position = "none"
  )
# color
# RD = #D43F3AFF
# OG = #EEA236FF
# GN = #5CB85CFF
# LB = #46B8DAFF
# DB = #357EBDFF
# PR = #9632B8FF
# GY = #B8B8B8FF
# data
# Respondent count (Number) and share (Perc) per country, then keep the 20
# countries with the highest counts. Perc is computed before the cut, so it is
# a share of all rows in `data`, not of the top 20.
top.countries <- data %>%
  group_by(Country) %>%
  summarise(Number = n()) %>%
  mutate(Perc = prop.table(Number)) %>%
  ungroup() %>%
  top_n(20, wt = Number)
# plot
# Treemap of the top countries: tile area is the respondent count and each
# country gets its own fill colour.
ggplot(
  data = top.countries,
  mapping = aes(area = Number, fill = factor(Country), label = Country)
) +
  geom_treemap(show.legend = FALSE) +
  geom_treemap_text(fontface = "italic", color = "black", place = "centre") +
  scale_fill_d3(palette = "category20", alpha = 0.5) +
  labs(
    title = "Treemap from Top 20 Countries with Most Respondents",
    x = NULL,
    y = NULL
  )
# plot
# Horizontal bar chart of respondent counts per country; each bar is labelled
# with that country's share of all respondents.
ggplot(top.countries, aes(x = reorder(Country, Number), y = Number)) +
  geom_col(fill = "#5CB85CFF") +
  geom_text(
    aes(label = paste0(format(Perc * 100, digits = 2), "%")),
    hjust = -0.05,
    vjust = 0.3,
    size = 2.5,
    color = "black"
  ) +
  coord_flip() +
  theme +
  labs(title = "Amount and Percent of Respondents", x = NULL, y = "Count")
# data
# dist.sal: full-time respondents with a known country, restricted to
#           countries that have more than 500 such respondents.
# con.sal:  those same countries ordered by descending median salary; used
#           below to fix the axis order of the salary plot.
dist.sal <- data %>%
  filter(Employment == "Employed full-time", !is.na(Country)) %>%
  group_by(Country) %>%
  mutate(Count = n()) %>%
  filter(Count > 500) %>%
  ungroup()
con.sal <- dist.sal %>%
  group_by(Country) %>%
  summarise(m.sal = median(ConvertedSalary, na.rm = TRUE)) %>%
  arrange(desc(m.sal)) %>%
  select(Country) %>%
  mutate(Country = factor(Country)) %>%
  ungroup()
# plot
# Violin plot of converted salary per country on a log10 scale; countries are
# laid out in the order given by con.sal (descending median salary).
options(scipen = 999) # strongly prefer fixed over scientific notation
ggplot(dist.sal) +
  geom_violin(aes(x = Country, y = ConvertedSalary), fill = "#46B8DAFF") +
  scale_x_discrete(limits = con.sal$Country) +
  scale_y_log10() +
  coord_flip() +
  theme +
  labs(title = "Distribution of Salary", x = NULL, y = "Log Converted Salary")
# data
# Employment rate per country, for countries with more than 500 respondents.
# `temp` lists the Employment answers that count as "employed".
temp <- c("Employed part-time",
          "Employed full-time",
          "Independent contractor, freelancer, or self-employed")
emp.con <- data %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(
    # `%in%` never returns NA, so `mean(x %in% temp, na.rm = TRUE)` silently
    # counted missing Employment answers as "not employed". Drop the missing
    # answers explicitly so the rate is taken over respondents who answered.
    avg.emp = mean(Employment[!is.na(Employment)] %in% temp),
    # Count is still the total number of respondents in the country.
    Count = n()
  ) %>%
  filter(Count > 500) %>%
  ungroup()
# plot
# Lollipop chart of the employment rate per country, sorted by rate, with the
# rate printed as a label on each point.
ggplot(emp.con, aes(x = reorder(Country, avg.emp), y = avg.emp)) +
  geom_point(size = 5, color = "#357EBDFF") +
  geom_segment(
    aes(x = Country, xend = Country, y = 0, yend = avg.emp),
    color = "#357EBDFF"
  ) +
  geom_label(
    aes(label = paste0(format(avg.emp * 100, digits = 3), "%")),
    hjust = "inward",
    size = 3,
    color = "#357EBDFF"
  ) +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  theme +
  labs(
    title = "Employment Rate from Top Surveyed Countries",
    x = NULL,
    y = "Employment Rate"
  )
# data
# Female-to-male respondent ratio (F2M) per country, for countries with more
# than 100 male + female respondents; keeps the 10 highest ratios.
fm.r <- data %>%
  filter(!is.na(Country), Gender %in% c("Male", "Female")) %>%
  group_by(Country, Gender) %>%
  summarise(Count = n()) %>%
  spread(Gender, Count) %>%
  mutate(F2M = Female / Male, N = Female + Male) %>%
  filter(N > 100) %>%
  arrange(desc(F2M)) %>%
  head(10) %>%
  ungroup()
# plot
# Horizontal bars of the 10 highest ratios, labelled with the ratio in percent.
p.1 <- ggplot(fm.r, aes(x = reorder(Country, F2M), y = F2M)) +
  geom_col(width = 0.5, fill = "#9632B8FF", alpha = 0.8) +
  geom_label(
    aes(label = paste0(format(F2M * 100, digits = 2), "%")),
    size = 4,
    color = "#9632B8FF",
    hjust = "inward"
  ) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Better F2M", x = NULL, y = "Female to Male Ratio")
# data
# Same female-to-male ratio table as above, but keeping the 10 lowest ratios.
# This overwrites fm.r; p.1 was already built from the earlier version.
fm.r <- data %>%
  filter(!is.na(Country), Gender %in% c("Male", "Female")) %>%
  group_by(Country, Gender) %>%
  summarise(Count = n()) %>%
  spread(Gender, Count) %>%
  mutate(F2M = Female / Male, N = Female + Male) %>%
  filter(N > 100) %>%
  arrange(F2M) %>%
  head(10) %>%
  ungroup()
# plot
# Horizontal bars of the 10 lowest ratios, labelled with the ratio in percent.
p.2 <- ggplot(fm.r, aes(x = reorder(Country, -F2M), y = F2M)) +
  geom_col(width = 0.5, fill = "#9632B8FF", alpha = 0.8) +
  geom_label(
    aes(label = paste0(format(F2M * 100, digits = 2), "%")),
    size = 4,
    color = "#9632B8FF",
    hjust = "inward"
  ) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Worse F2M", x = NULL, y = "Female to Male Ratio")
# plot
# Show the highest-ratio and lowest-ratio charts side by side.
ggarrange(p.1, p.2, nrow = 1)
# data
# Share of all respondents per undergraduate major, with missing majors kept
# as their own category. Despite its name, Count holds a proportion.
major.withna <- data %>%
  group_by(UndergradMajor) %>%
  summarise(Count = n() / nrow(data)) %>%
  ungroup()
# plot
# Lollipop chart of the share per major (including the NA category).
ggplot(major.withna, aes(x = reorder(UndergradMajor, -Count), y = Count)) +
  geom_point(color = "#357EBDFF", show.legend = FALSE, size = 4) +
  geom_segment(
    aes(x = UndergradMajor, xend = UndergradMajor, y = Count, yend = 0),
    size = 1,
    color = "#357EBDFF"
  ) +
  geom_label(
    aes(label = paste0(format(Count * 100, digits = 1), "%")),
    hjust = "inward",
    size = 3,
    color = "#357EBDFF"
  ) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme +
  labs(title = "Undergrad Majors with NAs", x = NULL, y = NULL)
# data
# Share per undergraduate major among respondents who reported one.
# Percent holds the raw row count per major; pct is its share of the total.
major.withoutna <- data %>%
  filter(!is.na(UndergradMajor)) %>%
  group_by(UndergradMajor) %>%
  summarise(Percent = n()) %>%
  mutate(pct = Percent / sum(Percent)) %>%
  ungroup()
# plot
# Lollipop chart of the share per major (missing majors excluded).
ggplot(major.withoutna, aes(x = reorder(UndergradMajor, -pct), y = pct)) +
  geom_point(color = "#9632B8FF", show.legend = FALSE, size = 4) +
  geom_segment(
    aes(x = UndergradMajor, xend = UndergradMajor, y = pct, yend = 0),
    size = 1,
    color = "#9632B8FF"
  ) +
  geom_label(
    aes(label = paste0(format(pct * 100, digits = 1), "%")),
    hjust = "inward",
    size = 3,
    color = "#9632B8FF"
  ) +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  theme +
  labs(title = "Undergrad Majors without NAs", x = NULL, y = NULL)
# data
# Per undergraduate major: share of respondents who participate on Stack
# Overflow at least a few times per month (Part), share who consider
# themselves a member (Memb), and the number of respondents (Count).
# `temp` lists the StackOverflowParticipate answers that count as participating.
temp <- c("Multiple times per day",
          "Daily or almost daily",
          "A few times per week",
          "A few times per month or weekly")
under.grad <- data %>%
  filter(!is.na(UndergradMajor)) %>%
  group_by(UndergradMajor) %>%
  summarise(
    # `%in%` never returns NA, so `na.rm = TRUE` had no effect here and missing
    # answers were counted as "does not participate", whereas Memb below
    # drops missing answers. Drop them explicitly so both rates are computed
    # over respondents who answered the respective question.
    Part = mean(StackOverflowParticipate[!is.na(StackOverflowParticipate)] %in% temp),
    Memb = mean(StackOverflowConsiderMember == "Yes", na.rm = TRUE),
    Count = n()
  )
# plot
# Scatter of membership share against participation share, one point per
# major (point size = respondent count), with a linear fit and text labels.
ggplot(under.grad, aes(x = Part, y = Memb)) +
  geom_smooth(method = "lm", color = "#D43F3AFF", size = 1.5) +
  geom_text_repel(
    aes(label = UndergradMajor),
    size = 3,
    point.padding = 0.1,
    family = "IBMPlexSans"
  ) +
  geom_point(aes(size = Count), alpha = 0.8, color = "#EEA236FF") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_size_continuous(labels = comma_format()) +
  theme +
  theme(legend.position = "right") +
  labs(
    title = "Member VS Participation by Undergrad Major",
    x = "% of Participation",
    y = "% of Who Believes Is A Member"
  )
# data
# Male/female respondents with a known salary, trimmed at the 95th percentile
# of salary. The percentile is taken after the first filter, so it is computed
# on the male/female, non-missing subset only.
# set.seed(323)
box.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(ConvertedSalary)) %>%
  filter(ConvertedSalary <= quantile(ConvertedSalary, 0.95))
# %>% sample_n(1000)
# plot
# Per gender: half violin on the left, half box plot on the right.
options(scipen = 999) # strongly prefer fixed over scientific notation
ggplot(
  box.gender,
  aes(x = Gender, y = ConvertedSalary, color = Gender, fill = Gender)
) +
  geom_half_violin(side = "l", alpha = 0.5, trim = FALSE) +
  geom_half_boxplot(side = "r", alpha = 0.5, width = 0.7, outlier.size = 2) +
  # geom_jitter(alpha = 0.3,
  # size = 2) +
  scale_color_locuszoom() +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Salary by Gender", x = NULL, y = "Salary")
# data
# Mean salary (MS) per gender and country, keeping only the rows whose mean is
# above the 75th percentile. summarise() leaves the result grouped by Gender
# (dplyr's default), so the percentile is evaluated within each gender.
sal.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(ConvertedSalary)) %>%
  group_by(Gender, Country) %>%
  summarise(MS = mean(ConvertedSalary)) %>%
  filter(MS > quantile(MS, 0.75)) %>%
  ungroup()
# plot
# Bubble chart: one point per gender/country, sized by mean salary.
ggplot(sal.gender, aes(x = Gender, y = Country, size = MS)) +
  geom_point(aes(color = Gender)) +
  scale_color_locuszoom() +
  theme +
  labs(title = "Salary by Gender", x = NULL, y = NULL)
# data
# dist.sal.gen: full-time male/female respondents with a known country, from
#               countries having between 101 and 499 such respondents.
# con.sal.gen:  those countries ordered by descending median salary; used
#               below to fix the axis order.
dist.sal.gen <- data %>%
  filter(
    Employment == "Employed full-time",
    Gender %in% c("Male", "Female"),
    !is.na(Country)
  ) %>%
  group_by(Country) %>%
  mutate(Count = n()) %>%
  filter(Count > 100 & Count < 500) %>%
  ungroup()
con.sal.gen <- dist.sal.gen %>%
  group_by(Country) %>%
  summarise(m.sal = median(ConvertedSalary, na.rm = TRUE)) %>%
  arrange(desc(m.sal)) %>%
  select(Country) %>%
  mutate(Country = factor(Country)) %>%
  ungroup()
# plot
# Box plots of salary per country, split by gender, on a log10 scale.
options(scipen = 999) # strongly prefer fixed over scientific notation
ggplot(dist.sal.gen) +
  geom_boxplot(aes(x = Country, y = ConvertedSalary, fill = Gender)) +
  scale_x_discrete(limits = con.sal.gen$Country) +
  scale_y_log10() +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right") +
  labs(
    title = "Salary Distribution by Gender in Countries",
    x = NULL,
    y = "Log Converted Salary"
  )
# data
# Count per gender and formal-education level, plus each level's share (pct).
# summarise() leaves the result grouped by Gender (dplyr's default), so pct
# sums to 1 within each gender.
edu.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(FormalEducation)) %>%
  group_by(Gender, FormalEducation) %>%
  summarise(Count = n()) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()
# plot
# Dodged horizontal bars: education share for each gender side by side.
ggplot(
  edu.gender,
  aes(x = reorder(FormalEducation, pct), y = pct, fill = Gender)
) +
  geom_col(position = "dodge") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 45)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right") +
  labs(title = "Formal Education by Gender", x = NULL, y = NULL)
# data
# Count per gender and Hobby answer, plus each answer's share (pct) within
# the gender (summarise() leaves the result grouped by Gender by default).
hob.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(Hobby)) %>%
  group_by(Gender, Hobby) %>%
  summarise(Count = n()) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()
# plot
# One facet per gender; bars show the share of each Hobby answer.
ggplot(hob.gender, aes(x = reorder(Hobby, -pct), y = pct, fill = Gender)) +
  geom_col() +
  geom_label(
    aes(label = paste0(format(pct * 100, digits = 3), "%")),
    size = 4,
    fill = "white"
  ) +
  facet_wrap(~Gender) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_locuszoom() +
  theme +
  # theme(strip.text.x = element_blank()) +
  # theme(legend.position = "right") +
  labs(title = "Hobby by Gender", x = "Codes as a Hobby", y = NULL)
# data
# Count per gender and Student answer, plus each answer's share (pct) within
# the gender (summarise() leaves the result grouped by Gender by default).
stu.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(Student)) %>%
  group_by(Gender, Student) %>%
  summarise(Count = n()) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()
# plot
# One facet per gender; bars show the share of each student status.
ggplot(stu.gender, aes(x = reorder(Student, -pct), y = pct, fill = Gender)) +
  geom_col() +
  geom_label(
    aes(label = paste0(format(pct * 100, digits = 2), "%")),
    size = 4,
    fill = "white"
  ) +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~Gender) +
  scale_fill_locuszoom() +
  theme +
  labs(title = "Student Status by Gender", x = "Student Status", y = NULL)
# data
# Count per gender and Age answer, plus each answer's share (pct) within the
# gender (summarise() leaves the result grouped by Gender by default).
age.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(Age)) %>%
  group_by(Gender, Age) %>%
  summarise(Count = n()) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()
# plot
# Dodged horizontal bars: age share for each gender side by side.
ggplot(age.gender, aes(x = reorder(Age, pct), y = pct, fill = Gender)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right") +
  labs(title = "Age by Gender", x = NULL, y = NULL)
# data
# Count per gender and LastNewJob answer, plus each answer's share (pct)
# within the gender (summarise() leaves the result grouped by Gender by default).
job.gender <- data %>%
  filter(Gender %in% c("Male", "Female"), !is.na(LastNewJob)) %>%
  group_by(Gender, LastNewJob) %>%
  summarise(Count = n()) %>%
  mutate(pct = Count / sum(Count)) %>%
  ungroup()
# plot
# Dodged horizontal bars: last-new-job share for each gender side by side.
ggplot(job.gender, aes(x = reorder(LastNewJob, pct), y = pct, fill = Gender)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  theme(legend.position = "right") +
  labs(title = "Last Job by Gender", x = NULL, y = NULL)
# data
# Per-gender counts of the languages respondents report working with.
# LanguageWorkedWith is split on ";" and unnested, giving one row per
# (respondent, language) before counting.
cur.lang = data %>%
filter(Gender == "Male" | Gender == "Female") %>%
filter(!is.na(LanguageWorkedWith)) %>%
mutate(LanguageWorkedWith = str_split(LanguageWorkedWith,
pattern = ";")) %>%
select(Gender, LanguageWorkedWith) %>%
unnest(LanguageWorkedWith) %>%
group_by(Gender, LanguageWorkedWith) %>%
summarise(Count = n()) %>%
# summarise() drops the last grouping level by default, so the steps below run
# within each Gender: Percent is a language's share of that gender's mentions.
mutate(Percent = prop.table(Count)*100) %>%
mutate(Percent = round(Percent, 1)) %>%
arrange(desc(Count)) %>%
# NOTE(review): reorder() is evaluated per Gender group here, so the resulting
# factor level order presumably depends on how the per-group factors are
# combined — confirm the ordering is the intended one.
mutate(LanguageWorkedWith = reorder(LanguageWorkedWith, Count)) %>%
ungroup()
# Per-gender slices, both sorted by descending factor level of the language.
# NOTE(review): the pyramid plot pairs male$Percent and female$Percent by
# position, which assumes both genders cover the same set of languages — confirm.
male = cur.lang %>%
filter(Gender == "Male") %>%
arrange(desc(LanguageWorkedWith))
female = cur.lang %>%
filter(Gender == "Female") %>%
arrange(desc(LanguageWorkedWith))
# Languages in the same sort order, used for the pyramid's centre labels.
lang.labels = cur.lang %>%
arrange(desc(LanguageWorkedWith))
# Colour-ramp generators for the male (left) and female (right) bars.
mfunc = colorRampPalette(c("red", "orange", "blue"))
ffunc = colorRampPalette(c("pink", "purple", "yellow"))
# plot
# Pyramid chart of language share by gender: male percentages on the left,
# female on the right, one colour per bar from the two colour ramps.
pyramid.plot(
  male$Percent,
  female$Percent,
  labels = unique(lang.labels$LanguageWorkedWith),
  top.labels = c("Male", "", "Female"),
  main = "Top Current Languages by Gender",
  gap = 5,
  show.values = TRUE,
  lxcol = mfunc(nrow(male)),
  rxcol = ffunc(nrow(female))
)
# Word clouds of the languages whose count is above the median count for the
# gender; each widget is saved as HTML and then captured to a PNG.
wc.m <- cur.lang %>%
  filter(Gender == "Male") %>%
  filter(Count > quantile(Count, 0.5)) %>%
  select(LanguageWorkedWith, Count) %>%
  wordcloud2(size = 0.5)
saveWidget(wc.m, "m.html", selfcontained = FALSE)
webshot("m.html", "m.png", vwidth = 700, vheight = 500, delay = 5)

wc.f <- cur.lang %>%
  filter(Gender == "Female") %>%
  filter(Count > quantile(Count, 0.5)) %>%
  select(LanguageWorkedWith, Count) %>%
  wordcloud2(size = 0.5)
saveWidget(wc.f, "f.html", selfcontained = FALSE)
webshot("f.html", "f.png", vwidth = 700, vheight = 500, delay = 5)
# data
# Number of respondents per country who work with Python or with R, keeping
# the 30 largest country/language combinations.
# LanguageWorkedWith holds several languages in one ";"-separated string, so
# comparing the raw column with "Python" or "R" only matched respondents who
# listed exactly that one language. Split and unnest it first so that every
# Python or R user is counted.
lang.pref <- data %>%
  filter(!is.na(Country), !is.na(LanguageWorkedWith)) %>%
  select(Country, LanguageWorkedWith) %>%
  mutate(LanguageWorkedWith = str_split(LanguageWorkedWith, pattern = ";")) %>%
  unnest(LanguageWorkedWith) %>%
  filter(LanguageWorkedWith %in% c("Python", "R")) %>%
  group_by(Country, LanguageWorkedWith) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  arrange(desc(Count)) %>%
  head(30)
# plot
# Horizontal bars of respondent counts per country, one facet per language.
ggplot(
  lang.pref,
  aes(x = reorder(Country, Count), y = Count, fill = LanguageWorkedWith)
) +
  geom_col() +
  facet_wrap(~LanguageWorkedWith) +
  coord_flip() +
  scale_fill_locuszoom() +
  theme +
  labs(
    title = "Python VS R for Language Preference",
    x = NULL,
    y = "Count"
  )
# data
# Count and percentage (0-100) of respondents per Employment answer,
# excluding missing answers.
emp.pie <- data %>%
  filter(!is.na(Employment)) %>%
  group_by(Employment) %>%
  summarise(Count = n()) %>%
  mutate(pct = Count / sum(Count) * 100) %>%
  ungroup()
# plot
# Pie chart: a single stacked bar drawn in polar coordinates.
ggplot(emp.pie, aes(x = "", y = pct, fill = Employment)) +
  geom_col() +
  coord_polar(theta = "y", start = 0) +
  scale_fill_locuszoom() +
  theme +
  theme(
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    panel.border = element_blank(),
    panel.grid = element_blank(),
    legend.position = "right",
    plot.title = element_text(hjust = 0.6)
  ) +
  labs(title = "Pie Chart by Employment", x = NULL, y = NULL)
# data
# Mean salary per race/ethnicity category, standardized across categories and
# flagged as above or below average. RaceEthnicity is split on "," and then on
# ";" so each piece becomes its own row before averaging.
sal.race <- data %>%
  filter(!is.na(RaceEthnicity), !is.na(ConvertedSalary)) %>%
  select(RaceEthnicity, ConvertedSalary) %>%
  mutate(RaceEthnicity = str_split(RaceEthnicity, pattern = ",")) %>%
  unnest(RaceEthnicity) %>%
  mutate(RaceEthnicity = str_split(RaceEthnicity, pattern = ";")) %>%
  unnest(RaceEthnicity) %>%
  group_by(RaceEthnicity) %>%
  summarise(avg = mean(ConvertedSalary)) %>%
  mutate(
    per = standardize(avg),
    per.type = ifelse(per < 0, "Below Average", "Above Average")
  )
# Replace the raw category strings with short display names.
# NOTE(review): this relabels by row position, so it assumes summarise()
# returned exactly these nine categories in exactly this order — confirm
# against the data (and the session's sort locale) before trusting the labels.
sal.race$RaceEthnicity <- c(
  "Indigenous Australian",
  "Pacific Islander",
  "Black African",
  "East Asian",
  "Hispanic",
  "Middle Eastern",
  "Native American",
  "South Asian",
  "White European"
)
# plot
# Diverging horizontal bars of standardized mean salary, coloured by whether
# the category is above or below average.
ggplot(sal.race, aes(x = reorder(RaceEthnicity, per), y = per)) +
  geom_col(aes(fill = per.type), width = 0.6) +
  coord_flip() +
  scale_fill_locuszoom(name = "Salary", labels = c("Above Avg", "Below Avg")) +
  theme +
  theme(legend.position = "right") +
  labs(
    title = "Salary by Race Ethnicity",
    x = NULL,
    y = "Standardized Salary"
  )
# data
# Count and percentage (0-100) of respondents per company size, excluding
# missing answers.
comp.size <- data %>%
  filter(!is.na(CompanySize)) %>%
  group_by(CompanySize) %>%
  summarise(Count = n()) %>%
  mutate(pct = Count / sum(Count) * 100) %>%
  ungroup()
# plot
# Horizontal bars of respondent counts per company size, labelled with the
# percentage.
ggplot(comp.size, aes(x = reorder(CompanySize, Count), y = Count)) +
  geom_col(show.legend = FALSE, fill = "#5CB85CFF") +
  geom_label(
    aes(label = paste0(format(pct, digits = 2), "%")),
    size = 4,
    color = "#5CB85CFF"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  coord_flip() +
  theme +
  labs(title = "Company Size for Respondents", x = NULL, y = "Count")
# data
# Respondent count per country and AIDangerous answer, keeping the 20 largest
# country/answer combinations.
con.aid <- data %>%
  filter(!is.na(AIDangerous)) %>%
  group_by(Country, AIDangerous) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  top_n(20, wt = Count)
# plot
# One facet per country; horizontal bars show the count for each answer.
ggplot(
  con.aid,
  aes(x = reorder(AIDangerous, -Count), y = Count, fill = Country)
) +
  geom_col() +
  facet_wrap(~Country, scales = "free_x") +
  coord_flip() +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 35)) +
  scale_fill_locuszoom() +
  theme +
  labs(
    title = "Top Countries Perceptions to Artificial Intelligence",
    x = "AI Dangerous",
    y = "Count"
  )
This analysis can serve as a template for exploring survey datasets in the future. The idea is to fix the goal along a first dimension, that is, a first factor, and then add a second or third factor to stack or slice the plot. We can either discover the question after plotting or pose the question before plotting. Repeating this process with different factors lets us explore the data. Survey analysis of this kind does not need statistical or mathematical analysis. It only needs simple plots, built by choosing factors, to bring out insights, some of which are predictable and some of which are novel.