# Loading the libraries
#install.packages("tidyverse")
#install.packages("grid")
#install.packages("gridExtra")
#install.packages("ggforce")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(ggforce)
# STRING PROCESSING
# Countries: replace the survey's long-form country names with short labels.
# The same four replacements are applied to the survey answers (Q3) and to
# continents$Country -- presumably so both tables keep identical country
# spellings; confirm against the code that combines them.
# Every pattern is wrapped in fixed() so it is matched literally: read as a
# regular expression, the "..." in the Iran entry matches any three characters.
multipleChoice18$Q3 <- str_replace(
  multipleChoice18$Q3, fixed("Iran, Islamic Republic of..."), "Iran"
)
multipleChoice18$Q3 <- str_replace(
  multipleChoice18$Q3,
  fixed("I do not wish to disclose my location"), "Won't disclose"
)
multipleChoice18$Q3 <- str_replace(
  multipleChoice18$Q3,
  fixed("United Kingdom of Great Britain and Northern Ireland"), "UK and NI"
)
multipleChoice18$Q3 <- str_replace(
  multipleChoice18$Q3, fixed("United States of America"), "USA"
)
continents$Country <- str_replace(
  continents$Country, fixed("Iran, Islamic Republic of..."), "Iran"
)
continents$Country <- str_replace(
  continents$Country,
  fixed("I do not wish to disclose my location"), "Won't disclose"
)
continents$Country <- str_replace(
  continents$Country,
  fixed("United Kingdom of Great Britain and Northern Ireland"), "UK and NI"
)
continents$Country <- str_replace(
  continents$Country, fixed("United States of America"), "USA"
)
# CONVERT CATEGORICAL DATA TO FACTOR
# Age groups (Q2): fix the level order from youngest to oldest. The labels are
# the level strings themselves, so only `levels` has to be supplied.
multipleChoice18[["Q2"]] <- factor(
  multipleChoice18[["Q2"]],
  levels = c(
    "18-21", "22-24", "25-29", "30-34", "35-39", "40-44",
    "45-49", "50-54", "55-59", "60-69", "70-79", "80+"
  )
)
# Degree (Q4): each survey answer (left) is paired with the short label shown
# on the plots (right); the order of the pairs is the factor's level order.
multipleChoice18[["Q4"]] <- local({
  degree_labels <- c(
    "Doctoral degree" = "PhD",
    "Master’s degree" = "Master",
    "Bachelor’s degree" = "Bachelor",
    "Professional degree" = "Professional",
    "No formal education past high school" = "High school",
    "Some college/university study without earning a bachelor’s degree" =
      "No degree",
    "I prefer not to answer" = "Won't disclose"
  )
  factor(
    multipleChoice18[["Q4"]],
    levels = names(degree_labels),
    labels = unname(degree_labels)
  )
})
# Undergraduate major (Q5): each survey answer (left) is paired with the short
# label shown on the plots (right).
# factor() matches `labels` to `levels` by position. The previous code listed
# the two vectors in different orders, so seven majors were mislabelled (for
# example "Environmental science or geology" was shown as "Physics/astronomy"
# and "Humanities" as "Other"). Writing the mapping as explicit pairs keeps
# every answer next to its own label.
multipleChoice18$Q5 <- local({
  major_labels <- c(
    "Medical or life sciences (biology, chemistry, medicine, etc.)" =
      "Medical/life sciences",
    "Computer science (software engineering, etc.)" = "Computer science",
    "Engineering (non-computer focused)" = "Engineering",
    "Mathematics or statistics" = "Mathematics/statistics",
    "A business discipline (accounting, economics, finance, etc.)" =
      "A business discipline",
    "Environmental science or geology" = "Env. science",
    "Social sciences (anthropology, psychology, sociology, etc.)" =
      "Social sciences",
    "Physics or astronomy" = "Physics/astronomy",
    "Information technology, networking, or system administration" =
      "IT/Network/Sys. admin",
    "I never declared a major" = "No major declared",
    "Other" = "Other",
    "Humanities (history, literature, philosophy, etc.)" = "Humanities"
  )
  factor(
    multipleChoice18$Q5,
    levels = names(major_labels),
    labels = unname(major_labels)
  )
})
# In what industry is your current employer? (Q7)
# Each survey answer (left) is paired with its plot label (right); the order
# of the pairs is the factor's level order.
multipleChoice18[["Q7"]] <- local({
  industry_labels <- c(
    "Retail/Sales" = "Retail / Sales",
    "I am a student" = "Student",
    "Computers/Technology" = "Computers / Technology",
    "Accounting/Finance" = "Accounting / Finance",
    "Academics/Education" = "Academics / Education",
    "Insurance/Risk Assessment" = "Insurance / Risk Assessment",
    "Other" = "Other",
    "Energy/Mining" = "Energy / Mining",
    "Non-profit/Service" = "Non-profit / Service",
    "Marketing/CRM" = "Marketing / CRM",
    "Government/Public Service" = "Government / Public Service",
    "Manufacturing/Fabrication" = "Manufacturing / Fabrication",
    "Online Service/Internet-based Services" =
      "Online Service / Internet-based Services",
    "Broadcasting/Communications" = "Broadcasting / Communications",
    "Medical/Pharmaceutical" = "Medical / Pharmaceutical",
    "Online Business/Internet-based Sales" =
      "Online Business / Internet-based Sales",
    "Military/Security/Defense" = "Military / Security/Defense",
    "Shipping/Transportation" = "Shipping / Transportation",
    "Hospitality/Entertainment/Sports" = "Hospitality / Entertainment/Sports"
  )
  factor(
    multipleChoice18[["Q7"]],
    levels = names(industry_labels),
    labels = unname(industry_labels)
  )
})
# Experience in current role (Q8): order the year ranges from shortest to
# longest; the answer text is used unchanged as the label.
multipleChoice18[["Q8"]] <- factor(
  multipleChoice18[["Q8"]],
  levels = c(
    "0-1", "1-2", "2-3", "3-4", "4-5", "5-10",
    "10-15", "15-20", "20-25", "25-30", "30+"
  )
)
# Yearly compensation (Q9): the salary bands keep their answer text as label;
# only the "do not wish to disclose" answer is shortened. It is the first
# level, followed by the bands in increasing order.
multipleChoice18[["Q9"]] <- local({
  salary_bands <- c(
    "0-10,000", "10-20,000", "20-30,000", "30-40,000",
    "40-50,000", "50-60,000", "60-70,000", "70-80,000",
    "80-90,000", "90-100,000", "100-125,000",
    "125-150,000", "150-200,000", "200-250,000",
    "250-300,000", "300-400,000", "400-500,000", "500,000+"
  )
  factor(
    multipleChoice18[["Q9"]],
    levels = c(
      "I do not wish to disclose my approximate yearly compensation",
      salary_bands
    ),
    labels = c("Won't disclose", salary_bands)
  )
})
# Time spent coding (Q23): drop the " of my time" suffix for the plot labels
# and order the levels from 0% to 100%.
multipleChoice18[["Q23"]] <- local({
  coding_time_labels <- c(
    "0% of my time" = "0%",
    "1% to 25% of my time" = "1% to 25%",
    "25% to 49% of my time" = "25% to 49%",
    "50% to 74% of my time" = "50% to 74%",
    "75% to 99% of my time" = "75% to 99%",
    "100% of my time" = "100%"
  )
  factor(
    multipleChoice18[["Q23"]],
    levels = names(coding_time_labels),
    labels = unname(coding_time_labels)
  )
})
# Coding experience (Q24): the two "never written code" answers get shorter
# labels and come first; the year ranges keep their text and follow in
# increasing order.
multipleChoice18[["Q24"]] <- local({
  year_ranges <- c(
    "< 1 year", "1-2 years", "3-5 years", "5-10 years",
    "10-20 years", "20-30 years", "30-40 years", "40+ years"
  )
  factor(
    multipleChoice18[["Q24"]],
    levels = c(
      "I have never written code and I do not want to learn",
      "I have never written code but I want to learn",
      year_ranges
    ),
    labels = c(
      "I don't write code and don't want to learn",
      "I don't write code but want to learn",
      year_ranges
    )
  )
})
# For how many years have you used machine learning methods (Q25)
# The two "never studied" answers get shorter labels and come first; the year
# ranges keep their text and follow in increasing order.
multipleChoice18[["Q25"]] <- local({
  year_ranges <- c(
    "< 1 year", "1-2 years", "2-3 years", "3-4 years", "4-5 years",
    "5-10 years", "10-15 years", "20+ years"
  )
  factor(
    multipleChoice18[["Q25"]],
    levels = c(
      "I have never studied machine learning and I do not plan to",
      "I have never studied machine learning but plan to learn in the future",
      year_ranges
    ),
    labels = c(
      "Never studied, do not plan to",
      "Never studied, plan to learn",
      year_ranges
    )
  )
})
# Use of machine learning in industries (Q10): each survey answer (left) is
# paired with the short label shown on the plots (right).
multipleChoice18[["Q10"]] <- local({
  ml_usage_labels <- c(
    "I do not know" = "I do not know",
    "No (we do not use ML methods)" = "No",
    "We are exploring ML methods (and may one day put a model into production)" =
      "Exploring ML methods",
    "We recently started using ML methods (i.e., models in production for less than 2 years)" =
      "Recently started",
    "We have well established ML methods (i.e., models in production for more than 2 years)" =
      "Well established ML methods",
    "We use ML methods for generating insights (but do not put working models into production)" =
      "For generating insights"
  )
  factor(
    multipleChoice18[["Q10"]],
    levels = names(ml_usage_labels),
    labels = unname(ml_usage_labels)
  )
})
# Expertise in data science (Q40): importance of independent projects relative
# to academic achievements. Each survey answer (left) is paired with its short
# plot label (right).
# NOTE(review): the "slightly less important" answer is labelled just
# "Less important", unlike "Slightly more important" -- confirm whether the
# missing "Slightly" is intentional before changing it.
multipleChoice18[["Q40"]] <- local({
  importance_labels <- c(
    "Independent projects are equally important as academic achievements" =
      "Equally important",
    "Independent projects are much more important than academic achievements" =
      "Much more important",
    "Independent projects are slightly more important than academic achievements" =
      "Slightly more important",
    "Independent projects are slightly less important than academic achievements" =
      "Less important",
    "Independent projects are much less important than academic achievements" =
      "Much less important",
    "No opinion; I do not know" = "No opinion/Don't know"
  )
  factor(
    multipleChoice18[["Q40"]],
    levels = names(importance_labels),
    labels = unname(importance_labels)
  )
})
# Are you a data scientist? (Q26): order the answers from "Definitely yes" to
# "Definitely not"; the answer text is used unchanged as the label.
multipleChoice18[["Q26"]] <- factor(
  multipleChoice18[["Q26"]],
  levels = c(
    "Definitely yes", "Probably yes", "Maybe",
    "Probably not", "Definitely not"
  )
)
Who are we?
Country of residence
There are 57 countries represented in the survey. Countries with fewer
than 50 respondents were grouped into the Other category, and some of
the respondents didn’t want to disclose that information.
Out of all the countries represented in the survey, 6 of them are
African. In terms of total number of participants, they represent 681
or 2.85% of the total number of respondents. This value is relatively
low, but still places the African continent in the fifth position.
Oceania is then the least represented continent with 407 respondents,
while Asia is the most represented one.
# Bar chart of the number of respondents per continent, sorted from the most
# to the least represented, with each bar's count printed above it.
newMultipleChoice %>%
  count(Continent, name = "Count") %>%
  # Logical flag used as the bar fill so that Africa gets its own shade. The
  # comparison already yields TRUE/FALSE, so no ifelse(..., T, F) is needed.
  mutate(highlight_flag = Continent == "Africa") %>%
  # `fill = Continent` is kept in the plot-level mapping: the text layer
  # inherits it, and it therefore takes part in training the fill scale.
  ggplot(aes(x = reorder(Continent, -Count), y = Count, fill = Continent)) +
  geom_col(aes(fill = highlight_flag), color = "grey") +
  geom_text(
    aes(label = as.character(Count)),
    position = position_dodge(width = 1),
    hjust = 0.5, vjust = -0.25, size = 3
  ) +
  scale_fill_brewer(palette = "PuBu") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Number of respondents",
    x = "", y = "Count", fill = "",
    caption = "Africa and the world"
  )

Compared to the previous year, the number of African respondents shows
a significant increase of 109.54%.
# p1: number of rows in `df` per Country and Year, drawn as one line (with
# points) per country.
p1 <- df %>%
  group_by(Country, Year) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count, group = Country)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0.
  geom_line(aes(color = Country), linewidth = 0.5) +
  geom_point(aes(color = Country), size = 4) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text = element_text(size = 12),
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 10)
  ) +
  labs(
    title = "Number of respondents",
    x = "", y = "Count", fill = "", caption = ""
  )
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# p2: stacked bars of female and male respondents per country (Q3), with the
# countries sorted by decreasing count.
p2 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  group_by(Q1, Q3) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Q3, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 10)
  ) +
  labs(
    title = "Country of residence",
    x = "", y = "", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# Draw p1 and p2 side by side in two columns.
grid.arrange(grobs = list(p1, p2), ncol = 2)

Apart from the incredible surge of respondents from Nigeria, we can
also see the introduction of new respondents from Morocco and
Tunisia.
Gender distribution
If you look at the right-hand side plot in the previous figure, you
see that the number of women is relatively low compared to the number of
men. This is not surprising since it is the case in all countries.
However, when I compared the female to male ratio of all countries, I
was surprised to see that:
Tunisia has the highest female to male ratio, and
Five of the six African countries present in this survey are
among the top 15 countries with the highest female to male
ratio.
I find this both refreshing and unexpected.
# Female-to-male respondent ratio per country (Q3), sorted from highest to
# lowest, with the six African countries drawn in a different fill colour.
multipleChoice18 %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q3)) %>%
  group_by(Q1, Q3) %>%
  summarise(Count = n()) %>%
  # Ungroup before reshaping so the grouping column Q1 can become the new
  # column names: one row per country with a Female and a Male count.
  ungroup() %>%
  pivot_wider(names_from = Q1, values_from = Count) %>%
  mutate(
    ratio = Female / Male,
    highlight_flag = Q3 %in% c(
      "Egypt", "Kenya", "Morocco", "Nigeria", "Tunisia", "South Africa"
    )
  ) %>%
  ggplot(aes(x = reorder(Q3, -ratio), y = ratio)) +
  geom_col(aes(fill = highlight_flag)) +
  scale_fill_brewer(palette = "Paired") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(
      size = 9.5, angle = -90, hjust = 0, vjust = 0.5
    )
  ) +
  labs(
    title = "Female to Male ratio",
    x = "", y = "Ratio", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Nigeria has the lowest female to male ratio in Africa, but it has the
highest number of respondents. Consequently, when compared to the other
continents, Africa holds the second place after North America.
# Female-to-male respondent ratio per continent, sorted from highest to
# lowest, with Africa drawn in a different fill colour.
newMultipleChoice %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  group_by(Continent, Q1) %>%
  summarise(Count = n()) %>%
  # One row per continent with a Female and a Male count.
  ungroup() %>%
  pivot_wider(names_from = Q1, values_from = Count) %>%
  mutate(
    ratio = Female / Male,
    highlight_flag = Continent == "Africa"
  ) %>%
  ggplot(aes(x = reorder(Continent, -ratio), y = ratio)) +
  geom_col(aes(fill = highlight_flag), color = "grey") +
  scale_fill_brewer(palette = "PuBu") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12)
  ) +
  labs(
    title = "Female to Male ratio",
    x = "", y = "Ratio", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Still, this is very promising.
Age distribution
More than half of the respondents are aged between 22 and 29, and
there are no respondents older than 69 years old. Men are in the majority
between 22 and 24 years, as opposed to the overall age distribution
where they are mostly in the 25-29 age group. The women are mostly in the
25-29 age group, as are the majority of women in all countries.
There are also no women in some age groups.
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# Age pyramid built from `temp` (columns Q2, Q1 and Count are used): male
# counts are drawn on the positive side and female counts are negated so they
# extend the other way. After coord_flip() the age groups run along the
# vertical axis. The axis labels are the absolute values of the breaks, so
# both sides read as positive counts.
ggplot(temp, aes(x = Q2, fill = Q1)) +
  geom_col(data = filter(temp, Q1 == "Male"), mapping = aes(y = Count)) +
  geom_col(data = filter(temp, Q1 == "Female"), mapping = aes(y = -Count)) +
  scale_y_continuous(
    breaks = seq(-50, 150, by = 50),
    labels = as.character(abs(seq(-50, 150, by = 50)))
  ) +
  scale_fill_brewer(palette = "Paired") +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Age distribution in Africa",
    x = "Age group (years)", y = "Count", fill = "",
    caption = ""
  )

Around 75% of Afrikagglers are younger than 29 years
old.
Educational background
Degree
Africa is the only continent where Bachelor is the most earned
degree. In all the other continents, the Master degree is the most common,
especially in Europe, where the share of PhD degree holders is also the highest.
# p1: share (%) of each degree (Q4) within every continent, one line per
# continent.
p1 <- newMultipleChoice %>%
  filter(!is.na(Q4)) %>%
  group_by(Continent, Q4) %>%
  # summarise() drops only the last grouping variable, so the result is still
  # grouped by Continent and prop.table() gives each degree's share within
  # its continent.
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q4, y = pct, group = Continent)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(aes(color = Continent), linewidth = 0.5) +
  geom_point(aes(color = Continent), size = 2) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  # No y scale is set here: `pct` is numeric, so the default continuous scale
  # applies. The previous scale_y_discrete() was a discrete scale on a
  # continuous axis.
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Educational background",
    x = "", y = "%", fill = "",
    caption = ""
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.
# p2: for each gender, a single stacked column showing the percentage of each
# degree (Q4); one facet row per gender. The result of summarise() is still
# grouped by Q1, so the percentages are computed within each gender.
p2 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q4)) %>%
  group_by(Q1, Q4) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot(aes(x = "", y = pct, fill = Q4)) +
  geom_col(width = 1) +
  scale_fill_brewer(palette = "Set3") +
  facet_grid(rows = vars(Q1)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Degree",
    x = "", y = "", fill = "Degree",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# Draw p1 and p2 side by side in two columns.
grid.arrange(grobs = list(p1, p2), ncol = 2)

But when you look closely at Africa, the percentage of women earning
Master and Doctoral degrees is higher than that of the men. In addition,
only a small percentage of them have no degree at all, which is not the
case for the men.
Undergraduate major
The most popular undergraduate degree of Kagglers is
Computer science. The two other majors are Mathematics/Statistics and
Engineering.
# Bubble chart of undergraduate major (Q5) by continent: point size is the
# major's share (%) within its continent and colour identifies the major.
# The result of summarise() is still grouped by Continent, so prop.table()
# is computed per continent.
newMultipleChoice %>%
  filter(!is.na(Q5)) %>%
  group_by(Continent, Q5) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot() +
  geom_point(
    mapping = aes(
      x = Continent, y = reorder(Q5, pct),
      size = pct, color = Q5
    )
  ) +
  # The former scale_fill_gradient() was dropped: no layer maps `fill`, so it
  # had no effect on the plot.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 12),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Undergraduate major",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

In Africa, the percentage of female respondents with a Computer
science background is higher than that of the male respondents. The same
goes with Mathematics/Statistics. So, good job, ladies!
Also, a higher percentage of men haven’t declared any major. This may
be due to the fact that some of them have “no degree”.
# Share (%) of each undergraduate major (Q5) within each gender, drawn as one
# line with points per gender. The result of summarise() is still grouped by
# Q1, so the percentages are computed per gender.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q5)) %>%
  group_by(Q1, Q5) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q5, -pct), y = pct, group = Q1)) +
  geom_point(aes(color = Q1), size = 2) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(aes(color = Q1), linewidth = 1) +
  # The former scale_fill_brewer() was dropped: no layer maps `fill`.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Undergraduate major",
    x = "", y = "%", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Professional experience
Current role and Industry
Africa and Asia have the highest proportion of students. In
Europe, North America and South America, the highest proportion of
respondents are working in data science.
# Bubble chart of current role (Q6) by continent: point size is five times the
# role's share (%) within its continent and colour identifies the role. The
# result of summarise() is still grouped by Continent, so prop.table() is
# computed per continent.
newMultipleChoice %>%
  filter(!is.na(Q6)) %>%
  group_by(Continent, Q6) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot(
    aes(x = Continent, y = reorder(Q6, pct), size = 5 * pct, color = Q6)
  ) +
  geom_point() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 8)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Current role",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Around 25% of the respondents are students. Then, the most
represented jobs are data scientists, data analysts and software
engineers.
# Share (%) of each current role (Q6) within each gender, drawn as one line
# with points per gender. The result of summarise() is still grouped by Q1,
# so the percentages are computed per gender.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q6)) %>%
  group_by(Q1, Q6) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q6, -pct), y = pct, group = Q1)) +
  geom_point(aes(color = Q1), size = 2) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(aes(color = Q1), linewidth = 1) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(
      size = 11, angle = -90, hjust = 0, vjust = 0.5
    ),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Current role",
    x = "", y = "%", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

In this figure, you can see that the percentage of male students is
higher than the percentage of female students. We can also see that
women are less affected by unemployment.
Most of the Data journalists are from Tunisia and Nigeria. Most of
the Data scientists are located in South Africa.
# Heat map of current role (Q6) against country (Q3). The result of
# summarise() is still grouped by Q6, so `pct` is each country's share (%)
# within a role.
afroCountries %>%
  filter(!is.na(Q6)) %>%
  group_by(Q6, Q3) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q6, y = Q3, fill = pct)) +
  # `linewidth` replaces the deprecated `size` for the tile border.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Current role by country",
    x = "", y = "", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

I would also like to investigate the origin of the increase in
African respondents from last year to this year.
I assumed that it all came from students since the women who
responded to the 2017 survey were exclusively students. However, after
plotting a pie chart of student versus non-student participants, it
looks like the percentage of students has decreased this year for both
genders. It is now lower than all the other occupations combined.
# propStud17: pie charts (one facet per gender) of the StudentStatus shares in
# the 2017 data. The result of summarise() is still grouped by GenderSelect,
# so the percentages are computed per gender.
propStud17 <- afroCountries17 %>%
  filter(GenderSelect %in% c("Female", "Male"), !is.na(StudentStatus)) %>%
  group_by(GenderSelect, StudentStatus) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot(aes(x = "", y = pct, fill = StudentStatus)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = pi / 3) +
  scale_fill_brewer(palette = "Paired") +
  facet_wrap(~GenderSelect) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  ) +
  labs(
    title = "Proportion of students", subtitle = "2017",
    x = "", y = "", fill = "",
    caption = ""
  )
## `summarise()` has grouped output by 'GenderSelect'. You can override using the
## `.groups` argument.
# propStud: pie charts (one facet per gender) of the Q6 shares in
# `propStud18`. The result of summarise() is still grouped by Q1, so the
# percentages are computed per gender.
propStud <- propStud18 %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q6)) %>%
  group_by(Q1, Q6) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot(aes(x = "", y = pct, fill = Q6)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = pi / 3) +
  scale_fill_brewer(palette = "Paired") +
  facet_wrap(~Q1) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  ) +
  labs(
    title = "Proportion of students", subtitle = "2018",
    x = "", y = "", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# Show the 2017 and 2018 student-proportion pies side by side. The 2017 plot
# (propStud17) was built above but never drawn, although the text compares
# the two years.
grid.arrange(propStud17, propStud, ncol = 2)

This means one thing: Professionals are embracing the data
science life.
The increase comes essentially from data scientists, data analysts
and software engineers.
# 2017
# p1: horizontal bars with the number of 2017 respondents per job title,
# longest bar at the top.
p1 <- afroCountries17 %>%
  filter(!is.na(CurrentJobTitleSelect)) %>%
  count(CurrentJobTitleSelect, name = "Count") %>%
  ggplot(
    aes(
      x = reorder(CurrentJobTitleSelect, Count),
      y = Count,
      fill = CurrentJobTitleSelect
    )
  ) +
  geom_col() +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(
    title = "Current role", subtitle = "2017",
    x = "", y = "Count", fill = "",
    caption = ""
  )
# 2018
# p2: horizontal bars with the number of 2018 respondents per current role
# (Q6), excluding students and missing answers, longest bar at the top.
p2 <- afroCountries %>%
  filter(Q6 != "Student", !is.na(Q6)) %>%
  count(Q6, name = "Count") %>%
  ggplot(aes(x = reorder(Q6, Count), y = Count, fill = Q6)) +
  geom_col() +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(
    title = "Current role", subtitle = "2018",
    x = "", y = "Count", fill = "",
    caption = "About us"
  )
# Draw the 2017 and 2018 role charts side by side in two columns.
grid.arrange(grobs = list(p1, p2), ncol = 2)

Experience in current role
Most of the students have less than a year of experience, which is
not surprising. Except for 5 jobs (Consultant, Chief officer, Project
manager, Research scientist, Salesperson), the majority (> 50%) of
the respondents working in each category have less than 3 years of
experience. Data scientists, data journalists and data analysts in
particular mostly have less than a year of experience.
Given the age distribution of the respondents, this should be
expected. Most of them are young professionals.
# Heat map of current role (Q6) against years of experience in that role
# (Q8); each tile is coloured by, and labelled with, the respondent count.
afroCountries %>%
  filter(!is.na(Q6), !is.na(Q8)) %>%
  group_by(Q6, Q8) %>%
  # Only Count is plotted; the percentage column that used to be computed
  # here was never used and has been removed.
  summarise(Count = n()) %>%
  ggplot(aes(x = Q8, y = Q6, fill = Count)) +
  # `linewidth` replaces the deprecated `size` for the tile border.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Experience in current role",
    x = "Years", y = "", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Use of Machine learning in industry
Most of the students don’t know about any machine learning in their
school. Even though most industries don’t implement machine learning
methods, there are some that are starting and are exploring machine
learning methods.
# Heat map of current role (Q6) against the employer's use of machine
# learning (Q10). The result of summarise() is still grouped by Q6, so the
# fill is each answer's share (%) within a role; the tile label is the raw
# count.
afroCountries %>%
  filter(!is.na(Q10)) %>%
  group_by(Q6, Q10) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q10, y = Q6, fill = pct)) +
  # `linewidth` replaces the deprecated `size` for the tile border.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Use of ML in industries",
    x = "", y = "", fill = "",
    caption = "Machine learning usage"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Activities at work
The majority analyze and understand data to influence product or
business decisions. Most of them also perform each of the tasks at some
point, some much more than the others:
Research assistants and scientists do more research than the
others.
Database and data engineers build and run data
infrastructures.
Salespersons and Data journalists do none of these
tasks.
# Heat map of current role (Q6) against day-to-day activities. The seven
# Q11_Part_* columns are stacked into one `Function` column, missing answers
# are dropped, and the answers are counted per role. The result of
# summarise() is still grouped by Q6, so the fill is each activity's share
# (%) within a role; the tile label is the raw count.
afroCountries %>%
  select(Q6, all_of(paste0("Q11_Part_", 1:7))) %>%
  # pivot_longer() replaces the superseded gather().
  pivot_longer(
    cols = -Q6,
    names_to = "questions",
    values_to = "Function",
    values_drop_na = TRUE
  ) %>%
  group_by(Q6, Function) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = Function, y = Q6, fill = pct)) +
  # `linewidth` replaces the deprecated `size` for the tile border.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 9),
    legend.position = "none"
  ) +
  labs(
    title = "Day to day function",
    x = "", y = "", fill = "",
    caption = "Machine learning use"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Coding experience
How long have you been coding to analyze data?
First of all, everybody who doesn’t write code yet wants to
learn, so yay! Most have been coding for less than 5
years, especially the women. Most of them are just getting started and
have 1-2 years of experience.
# p1: stacked columns with the number of female and male respondents per
# coding-experience band (Q24).
p1 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q24)) %>%
  group_by(Q1, Q24) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Q24, y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(size = 10, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Coding experience",
    x = "", y = "Count", fill = ""
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# p2: share (%) of each coding-experience band (Q24) within each country
# (Q3), one line with points per country. The result of summarise() is still
# grouped by Q3, so the percentages are computed per country.
p2 <- afroCountries %>%
  filter(!is.na(Q3), !is.na(Q24)) %>%
  group_by(Q3, Q24) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q24, y = pct, group = Q3)) +
  geom_point(aes(color = Q3), size = 1.5) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(aes(color = Q3), linewidth = 0.5) +
  # The former scale_fill_brewer() was dropped: no layer maps `fill`.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 10, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Coding experience by country",
    x = "", y = "%", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q3'. You can override using the `.groups`
## argument.
# Draw p1 and p2 side by side in two columns.
grid.arrange(grobs = list(p1, p2), ncol = 2)

Data engineers have the highest percentage of longest experience in
coding, with 5-10 years. Most research scientists have 3-5 years of
coding experience.
# Heat map of current role (Q6) against coding experience (Q24). The result
# of summarise() is still grouped by Q6, so the fill is each experience
# band's share (%) within a role; the tile label is the raw count.
afroCountries %>%
  filter(!is.na(Q24)) %>%
  group_by(Q6, Q24) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q24, y = Q6, fill = pct)) +
  # `linewidth` replaces the deprecated `size` for the tile border.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(
      size = 11, angle = -90, hjust = 0, vjust = 0.5
    ),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Coding experience", x = "", y = "",
    caption = "Coding experience"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Programming languages
Most used programming language
The programming language respondents use most depends
highly on their current position. Python clearly dominates,
but most statisticians among the respondents mainly use R, and
database engineers prefer SQL. It makes sense: everyone chooses
what fits the job best.
# Heatmap of Q17 (x, ordered by decreasing pct) against Q6 (y). Tiles are
# shaded by the within-Q6 percentage and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q17)) %>%
  group_by(Q6, Q17) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 here, so pct is a within-Q6 percentage.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q17, -pct), y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, deprecated for tile borders in ggplot2 3.4.0.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Most used programming language", x = "", y = "",
    caption = "Coding experience"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Everybody recommends Python to a newcomer who aspires to be a
data scientist.
# Heatmap of Q18 (x, ordered by decreasing pct) against Q6 (y). Tiles are
# shaded by the within-Q6 percentage and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q18)) %>%
  group_by(Q6, Q18) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 here, so pct is a within-Q6 percentage.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q18, -pct), y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, deprecated for tile borders in ggplot2 3.4.0.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Recommended programming language", x = "", y = "",
    caption = "Coding experience"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Even those who regularly use another language to analyze data
recommend learning Python first. Still, some faithful R users
recommend R.
# Heatmap of Q17 (x, "Most used") against Q18 (y, "Recommended"). Tiles are
# shaded by the share of each Q18 answer within its Q17 group and labelled
# with the raw count.
afroCountries %>%
  filter(!is.na(Q17), !is.na(Q18)) %>%
  group_by(Q17, Q18) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q17 here, so pct is a within-Q17 percentage.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q17, pct), y = Q18, fill = pct)) +
  # `linewidth` replaces `size`, deprecated for tile borders in ggplot2 3.4.0.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.x = element_text(angle = 35, hjust = 1),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Most used vs. Recommended programming languages",
    x = "Most used", y = "Recommended",
    caption = "Coding experience"
  )
## `summarise()` has grouped output by 'Q17'. You can override using the `.groups`
## argument.

Time spent actively coding
“Practice makes perfect”. Men spend more time coding than
women. Previous plots have shown that men have a lot more
experience in coding than women, so it makes sense. The women are still
learning. This also suggests that the more experienced you are at writing
code, the more time you spend doing it.
# Line chart titled "Time spent actively coding": for the "Female" and "Male"
# values of Q1, the percentage of each group in every Q23 category.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q23)) %>%
  group_by(Q1, Q23) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q1 here, so pct is a within-Q1 percentage.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q23, y = pct, group = Q1)) +
  geom_point(aes(color = Q1), size = 2) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
  geom_line(aes(color = Q1), linewidth = 1) +
  # No fill scale here: the plot maps `color` only, so the previous
  # scale_fill_brewer() call never had any effect.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 12),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Time spent actively coding",
    x = "of time", y = "% of people", fill = "",
    caption = "Coding experience"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Students, software engineers, data analysts and data scientists spend
the most time coding, from 50 to 74% of their time.
# Heatmap of Q23 (x) against Q6 (y). Tiles are shaded by the within-Q6
# percentage and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q23)) %>%
  group_by(Q6, Q23) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 here, so pct is a within-Q6 percentage.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q23, y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, deprecated for tile borders in ggplot2 3.4.0.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Time spent coding",
    x = "of time", y = "",
    caption = "Coding experience"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

IDEs
The most used IDE is Jupyter/Ipython. The second IDE of choice for
the researchers is MATLAB, while students prefer Notepad++. Data
engineers and data analysts are choosing RStudio.
# Heatmap titled "IDEs": one tile per (Q6, IDE) pair, shaded and labelled
# with the number of non-missing answers. The x axis is ordered by
# decreasing within-Q6 percentage.
afroCountries %>%
  select(Q6, 30:45) %>%
  # NOTE(review): 30:45 spans 16 column positions, but only columns 2-16 of
  # the selection are stacked, so the last selected column stays ungathered.
  # Presumably that is a free-text column left out on purpose -- confirm.
  gather(key = "questions", value = "IDEs", 2:16) %>%
  filter(!is.na(IDEs)) %>%
  group_by(Q6, IDEs) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(IDEs, -pct), y = Q6, fill = Count)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "IDEs",
    x = "", y = "", fill = "",
    caption = "Coding experience"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Hosted notebook
Most respondents don’t use a hosted notebook. Kaggle Kernels is the
choice for those who do.
# Heatmap of hosted-notebook answers (Q14_Part_1 to Q14_Part_11, stacked into
# one column) against Q6. Tiles are shaded and labelled with the raw count;
# the x axis is ordered by decreasing within-Q6 percentage.
afroCountries %>%
  select(Q6, Q14_Part_1:Q14_Part_11) %>%
  gather(key = "questions", value = "Hosted_Notebook", 2:12) %>%
  filter(!is.na(Hosted_Notebook)) %>%
  group_by(Q6, Hosted_Notebook) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Hosted_Notebook, -pct), y = Q6, fill = Count)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Hosted notebook used at school or work",
    subtitle = "(past 5 years)",
    x = "", y = "", fill = "",
    caption = ""
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Are you a data scientist?
With all the time spent coding to analyze data, most of the African
respondents tend to think of themselves as a data scientist. This was
kind of expected since a large part of the respondents are data
scientists. Only few women don’t think they are data scientists.
# Stacked bar chart of Q26 answer counts, with bars filled by Q1
# ("Female" / "Male" rows only).
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q26)) %>%
  group_by(Q1, Q26) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Q26, y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 8)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Think of themself as a data scientist",
    x = "", y = "Count", fill = "",
    caption = "Personal views"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Compared to the other continents, Afrikagglers are more
confident in their data scientist identity: 59.89% answered “Yes”.
# Line chart: for every Continent, the percentage of its rows in each Q26
# category.
newMultipleChoice %>%
  filter(!is.na(Q26)) %>%
  group_by(Continent, Q26) %>%
  summarise(Count = n()) %>%
  # Still grouped by Continent here, so pct is a within-continent percentage.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q26, y = pct, group = Continent)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
  geom_line(aes(color = Continent), linewidth = 0.5) +
  geom_point(aes(color = Continent), size = 2) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Think of themself as a data scientist",
    x = "", y = "%", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Machine learning methods used at work/school
Afrikagglers started using machine learning methods only
recently. The majority have used them for less than a year.
# Heatmap of Q25 (x) against Q6 (y). Tiles are shaded and labelled with the
# raw count.
afroCountries %>%
  filter(!is.na(Q25)) %>%
  group_by(Q6, Q25) %>%
  summarise(Count = n()) %>%
  # pct (within-Q6 percentage) is computed but not mapped to any aesthetic.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q25, y = Q6, fill = Count)) +
  # `linewidth` replaces `size`, deprecated for tile borders in ggplot2 3.4.0.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Usage of machine learning at work/school",
    x = "", y = "",
    caption = "Coding experience"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Machine learning products
The same goes with machine learning products. The majority of the
respondents have never used any machine learning product. Only a few of
them have used it, and most of them are using SAS.
# Bubble chart of machine learning products against Q6. Columns 152-194 are
# stacked into ML_Products, and only the five highest-pct products of each Q6
# group (ties included) are kept. Point size is the within-Q6 percentage.
afroCountries %>%
  select(Q6, 152:194) %>%
  gather(key = "questions", value = "ML_Products", 2:44) %>%
  filter(!is.na(ML_Products)) %>%
  group_by(Q6, ML_Products) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  top_n(5, pct) %>%
  ggplot() +
  geom_point(
    mapping = aes(
      x = reorder(ML_Products, -Count),
      y = Q6,
      size = pct,
      color = ML_Products
    )
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 8),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Machine learning products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

But (un)fortunately, Africa is not the only one in that situation.
About 25% of the overall respondents haven’t used any machine
learning products either.
# Bubble chart of machine learning products against Continent. Columns
# 152-194 are stacked into ML_Products, and only the five highest-count
# products of each continent (ties included) are kept. Point size is the
# within-continent percentage.
newMultipleChoice %>%
  select(Continent, 152:194) %>%
  gather(key = "questions", value = "ML_Products", 2:44) %>%
  filter(!is.na(ML_Products)) %>%
  group_by(Continent, ML_Products) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  top_n(5, Count) %>%
  ggplot() +
  geom_point(
    mapping = aes(
      x = Continent,
      y = reorder(ML_Products, pct),
      size = pct,
      color = ML_Products
    )
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 8),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Machine learning products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

The usage of machine learning products differs from one continent to
the other.
Azure Machine Learning Studio, SAS and Cloudera are the most used
products in Africa.
Asian respondents prefer Google Cloud Speech-to-text API, Google
Cloud Vision API, Google Cloud Natural Language API.
Those from Europe use Cloudera, Azure Machine Learning Studio,
and RapidMiner.
Most used machine learning framework
Scikit-Learn, TensorFlow and Keras are the most used machine learning
frameworks overall, which is consistent with the overall tendency across all
countries.
# Heatmap of machine learning frameworks (Q19_Part_1 to Q19_Part_19, stacked
# into one column) against Q6. Tiles are shaded by the within-Q6 percentage
# and labelled with the raw count.
afroCountries %>%
  select(Q6, Q19_Part_1:Q19_Part_19) %>%
  # Stack every selected Q19 part. The previous gather(2:19) covered only 18
  # of the 19 selected columns, so Q19_Part_19 never reached the counts.
  gather(key = "questions", value = "ML_Framework", -Q6) %>%
  filter(!is.na(ML_Framework)) %>%
  group_by(Q6, ML_Framework) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 here, so pct is a within-Q6 percentage.
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(ML_Framework, -Count), y = Q6, fill = pct)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Machine learning framework (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Do you consider machine learning a “black box”
A larger proportion of women are confident, but a larger
proportion of women also consider machine learning models to be black boxes.
# Horizontal 100%-stacked bar chart: the share of each Q48 answer among the
# "Female" and "Male" values of Q1.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q48), !is.na(Q1)) %>%
  group_by(Q1, Q48) %>%
  ggplot(aes(x = Q1, fill = Q48)) +
  # position = "fill" rescales every bar to the same total height.
  geom_bar(position = "fill") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_fill_brewer(palette = "Set3") +
  coord_flip() +
  labs(
    title = "Do you consider ML as 'black boxes'?",
    x = "", y = "", fill = "", caption = "Personal views"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    legend.position = "bottom",
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) +
  guides(fill = guide_legend(ncol = 1))

Metrics used to determine success
The metrics used by African industries are those that consider
accuracy first (with more or less the same percentage), followed by
revenue and/or business goals. Morocco has the highest percentage of people
not involved in the organization that builds ML models in their
company.
South Africa and Kenya are more revenue-oriented than the other
countries. This may explain the higher compensation rate in these countries
(cf. later paragraphs). Unfair bias is only slightly considered.
# 100%-stacked bar chart: for every Q3 value, the share of each success
# metric (Q42_Part_1 to Q42_Part_5, stacked into one column).
afroCountries %>%
  select(Q42_Part_1:Q42_Part_5, Q3) %>%
  gather(key = "questions", value = "metrics", 1:5) %>%
  filter(!is.na(metrics), !is.na(Q3)) %>%
  group_by(metrics, Q3) %>%
  ggplot(aes(x = Q3, fill = metrics)) +
  # position = "fill" rescales every bar to the same total height.
  geom_bar(position = "fill") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Metrics used to measure model success",
    x = "", y = "", fill = "", caption = "Machine learning usage"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11),
    legend.position = "bottom"
  ) +
  guides(fill = guide_legend(ncol = 1))

That is also the case for all countries/continents.
# Heatmap of success metrics (Q42_Part_1 to Q42_Part_5, stacked into one
# column) against Continent. Tiles are shaded and labelled with the
# within-continent percentage.
newMultipleChoice %>%
  select(Continent, Q42_Part_1:Q42_Part_5) %>%
  gather(key = "questions", value = "Metrics", 2:6) %>%
  filter(!is.na(Metrics)) %>%
  group_by(Continent, Metrics) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = Continent, y = reorder(Metrics, pct), fill = pct)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = sprintf("%.2f%%", pct)),
    hjust = 0.5, vjust = 0.5, size = 4, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 20)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Metrics used by organizations",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

What about data?
Most used type of data
Most use numerical data, but data scientists handle tabular data
more.
# Heatmap of Q32 (x, ordered by decreasing count) against Q6 (y). Tiles are
# shaded by the within-Q6 percentage and labelled with the raw count.
afroCountries %>%
  select(Q6, Q32) %>%
  filter(!is.na(Q32)) %>%
  group_by(Q6, Q32) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q32, -Count), y = Q6, fill = pct)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Most used data types",
    x = "Type of data", y = "", fill = "",
    caption = "Data"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Where to find public dataset
Dataset aggregator platforms are the main sources of data used,
followed by Google search and Github.
# Stacked bar chart of public-dataset sources (columns 266-276, stacked into
# DataSource), with bars filled by Q1 ("Female" / "Male" rows only).
afroCountries %>%
  select(Q1, 266:276) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  gather(key = "questions", value = "DataSource", 2:12) %>%
  filter(!is.na(DataSource)) %>%
  group_by(Q1, DataSource) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(DataSource, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Sources used to get public datasets",
    x = "", y = "Count", fill = "",
    caption = "Data"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Other tools and services
Cloud computing services at work/school
A great number of the respondents have never used cloud computing
services. Those who do are using Amazon Web services, Google Cloud
platform or Microsoft Azure mostly.
# Heatmap of cloud computing services (Q15_Part_1 to Q15_Part_7, stacked into
# one column) against Q6. Tiles are shaded by the within-Q6 percentage and
# labelled with the raw count.
afroCountries %>%
  select(Q6, Q15_Part_1:Q15_Part_7) %>%
  gather(key = "questions", value = "Cloud_services", 2:8) %>%
  filter(!is.na(Cloud_services)) %>%
  group_by(Q6, Cloud_services) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Cloud_services, -Count), y = Q6, fill = pct)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Cloud computing services at work/school",
    x = "", y = "", fill = "",
    caption = ""
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Cloud computing products
In general, a great proportion of the respondents are not using cloud
computing products, especially students and data analysts. Google
Compute Engine is the most used cloud computing products for those using
it.
# Heatmap of cloud computing products (Q27_Part_1 to Q27_Part_20, stacked
# into one column) against Q6. Tiles are shaded by the within-Q6 percentage
# and labelled with the raw count.
afroCountries %>%
  select(Q6, Q27_Part_1:Q27_Part_20) %>%
  gather(key = "questions", value = "cloud", 2:21) %>%
  filter(!is.na(cloud), !is.na(Q6)) %>%
  group_by(Q6, cloud) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(cloud, -Count), y = Q6, fill = pct)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 2.5, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Cloud computing products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Cloud computing products"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Most used visualization tools
The love for Matplotlib is strong. It is followed by ggplot2 and
Seaborn, but they are far behind.
# Stacked bar chart of Q22 answer counts (ordered by decreasing count), with
# bars filled by Q1 ("Female" / "Male" rows only).
afroCountries %>%
  select(Q1, Q22) %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q22)) %>%
  group_by(Q1, Q22) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Q22, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.text = element_text(size = 11),
    legend.position = "top"
  ) +
  labs(
    title = "Most used vizualisation libraries",
    x = "", y = "Count", fill = "",
    caption = "Other tools"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

The choice of the visualization libraries depends of course on the
programming language used. R, SQL, SAS/STATA and JavaScript users are
using ggplot2. Python and Java users are using Matplotlib
# Heatmap of Q22 (x, ordered by decreasing count) against Q17 (y). Tiles are
# shaded by the within-Q17 percentage and labelled with the raw count.
afroCountries %>%
  select(Q17, Q22) %>%
  filter(!is.na(Q17), !is.na(Q22)) %>%
  group_by(Q17, Q22) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Q22, -Count), y = Q17, fill = pct)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.25, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 9),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.text = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(
    title = "Most used vizualisation library",
    x = "", y = "Most used programming language", fill = "",
    caption = "Vizualisation libraries"
  )
## `summarise()` has grouped output by 'Q17'. You can override using the `.groups`
## argument.

Relational database
MySQL, Microsoft SQL Server and PostgresSQL are the three most used
relational database products these past five years and none of the
respondents uses Ingres.
# Stacked bar chart of relational database products (columns 196-223,
# stacked into RDB_Products), with bars filled by Q1 ("Female" / "Male" rows
# only).
afroCountries %>%
  select(Q1, 196:223) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  gather(key = "questions", value = "RDB_Products", 2:29) %>%
  filter(!is.na(RDB_Products), !is.na(Q1)) %>%
  group_by(Q1, RDB_Products) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(RDB_Products, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 8, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Relational database products (past 5 years)",
    x = "", y = "Count", fill = "",
    caption = "Relational database"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Big data/analytics products
A huge number of African respondents are not using any big data and
analytics tools. Google BigQuery is the most used one for those using
big data and analytics tools, and there are only few of them.
# Stacked bar chart of big data / analytics products (columns 225-249,
# stacked into BigData_Products), with bars filled by Q1 ("Female" / "Male"
# rows only).
afroCountries %>%
  select(Q1, 225:249) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  gather(key = "questions", value = "BigData_Products", 2:26) %>%
  filter(!is.na(BigData_Products), !is.na(Q1)) %>%
  group_by(Q1, BigData_Products) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(BigData_Products, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Big data and analytics tools (past 5 years)",
    x = "", y = "Count", fill = "",
    caption = "Big data and analytics tools"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Big data and analytics tools were not that popular in African and
Asian continents. In Africa in particular, almost half of the
respondents never used them.
# Heatmap of big data / analytics products (columns 225-249, stacked into
# BigData_Products) against Continent. Tiles are shaded and labelled with the
# within-continent percentage.
newMultipleChoice %>%
  select(Continent, 225:249) %>%
  gather(key = "questions", value = "BigData_Products", 2:26) %>%
  filter(!is.na(BigData_Products)) %>%
  group_by(Continent, BigData_Products) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = Continent, y = reorder(BigData_Products, pct), fill = pct)) +
  geom_tile(stat = "identity") +
  geom_text(
    aes(label = sprintf("%.2f%%", pct)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Big data and analytics tools (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Further education
Percentage of current machine learning and data science
training
To stay on top of the game, Afrikagglers use other means to
learn about new trends in machine learning and data science.
The median value of the percentage of machine learning and data
science is slightly higher for male respondents in Self-taught, Online
courses and Work. However, the median is higher for African women for
learning in University. The percentage of women having higher degrees is
higher than that of men in Africa, a higher percentage of them were/are
also studying computer science and Mathematics/Statistics, so this may
explain that. That being said, only a few of them are learning through
Kaggle competitions.
## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción
## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción
## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción
## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción
## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción
## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción
# Build one box plot of a single Q35_Part_* column, split by the "Female" and
# "Male" values of Q1.
#
# column       Name (string) of the column plotted on the y axis.
# title        Plot title.
# title_hjust  Horizontal justification of the title; NULL (the default)
#              leaves the theme's own setting untouched.
#
# Returns a ggplot object.
#
# NOTE(review): this reads multipleChoice18, whereas the surrounding text
# discusses African respondents and the other Africa plots use afroCountries.
# Confirm which data set is intended.
q35_boxplot <- function(column, title, title_hjust = NULL) {
  multipleChoice18 %>%
    select(Q1, all_of(column)) %>%
    filter(Q1 == "Female" | Q1 == "Male") %>%
    # .data[[column]] looks the column up by its string name inside aes().
    ggplot(aes(x = "", y = .data[[column]], fill = Q1)) +
    geom_boxplot() +
    scale_fill_brewer(palette = "Paired") +
    theme(
      plot.title = element_text(size = 13, hjust = title_hjust),
      legend.text = element_text(size = 9),
      legend.title = element_blank()
    ) +
    labs(title = title, x = "", y = "%")
}

p1 <- q35_boxplot("Q35_Part_1", "Self-taught")
p2 <- q35_boxplot("Q35_Part_2", "Online courses")
p3 <- q35_boxplot("Q35_Part_3", "Work")
p4 <- q35_boxplot("Q35_Part_4", "University")
p5 <- q35_boxplot("Q35_Part_5", "Kaggle competitions")
# Only the "Other" panel centres its title, as in the original chunk.
p6 <- q35_boxplot("Q35_Part_6", "Other", title_hjust = 0.5)

# Lay the six panels out in a grid with three columns.
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)
## Warning: Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).

Platform where you began/completed Data Science courses
# Line chart titled "Platform for data science courses": for every Q3 value,
# the percentage of its answers going to each online platform (Q36_Part_1 to
# Q36_Part_13, stacked into one column).
afroCountries %>%
  select(Q3, Q36_Part_1:Q36_Part_13) %>%
  gather(key = "questions", value = "OnlinePlatform", 2:14) %>%
  filter(!is.na(OnlinePlatform)) %>%
  group_by(Q3, OnlinePlatform) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q3 here, so pct is a within-Q3 percentage.
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = OnlinePlatform, y = pct, group = Q3)) +
  geom_point(aes(color = Q3), size = 2) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
  geom_line(aes(color = Q3), linewidth = 0.5) +
  # No fill scale here: the plot maps `color` only, so the previous
  # scale_fill_gradient() call never had any effect.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Platform for data science courses",
    x = "", y = "%", fill = "",
    caption = ""
  )
## `summarise()` has grouped output by 'Q3'. You can override using the `.groups`
## argument.

Coursera is the most popular platform for learning, ahead of DataCamp
and Udemy.
The latter seems to be more popular among male users (in terms
of percentage) than among female users. That is also the case for Kaggle
Learn (5th place) and edX (6th place).
# Number of respondents per online learning platform
# (Q36_Part_1..Q36_Part_13), stacked by gender (Q1, "Female"/"Male" only).
afroCountries %>%
  select(Q1, Q36_Part_1:Q36_Part_13) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  # Name the gathered columns explicitly instead of relying on positions 2:14.
  gather(key = "questions", value = "OnlinePlatform",
         Q36_Part_1:Q36_Part_13) %>%
  filter(!is.na(OnlinePlatform)) %>%
  group_by(Q1, OnlinePlatform) %>%
  summarise(Count = n(), .groups = "drop") %>%
  # Order bars by the total stacked height (sum over genders). reorder()'s
  # default FUN is mean, which mis-ranks a platform reported by one gender only.
  ggplot(aes(x = reorder(OnlinePlatform, -Count, FUN = sum),
             y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    # vjust = 0.5 keeps the rotated labels centred on their tick marks.
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(title = "Online platform used for learning",
       x = "", y = "Count", fill = "",
       caption = "Online learning")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Platform where you spent most of your time
What do they think about …?
What do they think of MOOCs/in-person bootcamp?
A large number of the respondents are in favor of online learning,
compared to traditional institutions, especially professionals.
# Heat map of the answers to Q39_Part_1 by Q6. Tile colour is the share (%) of
# each answer within its Q6 group; the printed label is the raw count.
afroCountries %>%
  filter(!is.na(Q39_Part_1)) %>%
  group_by(Q6, Q39_Part_1) %>%
  # Stay grouped by Q6 so prop.table() normalises within each Q6 level.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q39_Part_1, -Count), y = Q6, fill = pct)) +
  # `linewidth` is the ggplot2 >= 3.4.0 name for the tile border width.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(title = "Online learning vs. Traditional institution",
       x = "", y = "",
       caption = "")
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

The same goes with in-person bootcamp.
# Heat map of the answers to Q39_Part_2 by Q6. Tile colour is the share (%) of
# each answer within its Q6 group; the printed label is the raw count.
afroCountries %>%
  filter(!is.na(Q39_Part_2)) %>%
  group_by(Q6, Q39_Part_2) %>%
  # Stay grouped by Q6 so prop.table() normalises within each Q6 level.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q39_Part_2, -pct), y = Q6, fill = pct)) +
  # `linewidth` is the ggplot2 >= 3.4.0 name for the tile border width.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(title = "In-person bootcamp vs. Traditional institution",
       x = "", y = "",
       caption = "")
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Data science expertise
Most of the respondents think that independent projects are much
more important than academic achievements.
# Distribution (%) of the answers to Q40 within each gender (Q1,
# "Female"/"Male" only), drawn as stacked columns.
afroCountries %>%
  select(Q1, Q40) %>%
  # The %in% test already discards NA genders, so no separate is.na(Q1) filter.
  filter(Q1 %in% c("Female", "Male"), !is.na(Q40)) %>%
  group_by(Q1, Q40) %>%
  # Stay grouped by Q1 so the percentages sum to 100 within each gender.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Q40, -pct), y = pct, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  # The y axis shows `pct`, so it is labelled "%" (it used to read "Count").
  labs(title = "Independent projects vs. Academic achievements",
       x = "", y = "%", fill = "",
       caption = "Expertise in data science")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

45.49% of African respondents find that independent projects are much
more important than academic achievements.
# Heat map of Q40 answers per continent; each tile is coloured by, and labelled
# with, the share (%) of that answer within its continent.
newMultipleChoice %>%
  filter(!is.na(Q40)) %>%
  group_by(Continent, Q40) %>%
  summarise(Count = n()) %>%
  mutate(pct = 100 * prop.table(Count)) %>%
  ggplot(mapping = aes(x = Continent, y = reorder(Q40, pct), fill = pct)) +
  geom_tile() +
  geom_text(
    mapping = aes(label = sprintf("%.2f%%", pct)),
    color = "white", size = 3, hjust = 0.5, vjust = 0.25
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 30)) +
  labs(
    title = "Expertise in data science",
    subtitle = "Independent projects vs. academic achievements",
    caption = "Africa and the world",
    x = "", y = "", fill = ""
  ) +
  theme(
    legend.position = "none",
    legend.text = element_text(size = 11),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Let’s talk about Money!
# Distribution (%) of yearly compensation bands (Q9) within each gender (Q1,
# "Female"/"Male" only), drawn as one coloured line per gender.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q9)) %>%
  group_by(Q1, Q9) %>%
  # Stay grouped by Q1 so the percentages sum to 100 within each gender.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = Q9, y = pct, group = Q1, color = Q1)) +
  geom_point(size = 2) +
  # `linewidth` replaces `size` for line geoms since ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  # The layers map `color`, not `fill`, so the palette must be set on the
  # colour scale; the former scale_fill_brewer() call was silently ignored.
  scale_color_brewer(palette = "Paired") +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, vjust = 0.5, hjust = 0),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(title = "Yearly compensation",
       x = "$", y = "%",
       caption = "About us")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

The average earnings, for the respondents who were willing to share,
are around 0-10,000$ and 10,000-20,000$ for all countries.
The highest paid countries in Africa are Kenya and South Africa, with
more than 300,000$ for male respondents.
# Heat map of yearly compensation (Q9) against country (Q3), one facet row per
# gender (Q1, "Female"/"Male" only).
# `pct` is the share of each country within a (gender, compensation band) pair,
# because summarise() drops only the last grouping variable (Q3).
# NOTE(review): if the intended reading is the compensation distribution within
# each country, group by (Q1, Q3, Q9) instead - confirm with the author.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q3) %>%
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q3, y = Q9, fill = pct)) +
  # `linewidth` is the ggplot2 >= 3.4.0 name for the tile border width.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = sprintf("%.2f%%", pct)), color = "white", size = 2) +
  facet_grid(Q1 ~ .) +
  # After the flip, countries run down the vertical axis and compensation
  # bands along the horizontal one.
  coord_flip() +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 35)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  ) +
  labs(title = "Yearly compensation",
       x = "", y = "$", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

Regardless of the degree, most of the respondents earn less than
10’000$ per year for both genders. At first glance, male
PhDs are paid way better than their female counterparts.
# Heat map of yearly compensation (Q9) against degree (Q4), one facet row per
# gender (Q1, "Female"/"Male" only).
# `pct` is the share of each degree within a (gender, compensation band) pair,
# because summarise() drops only the last grouping variable (Q4).
# NOTE(review): if the intended reading is the compensation distribution within
# each degree, group by (Q1, Q4, Q9) instead - confirm with the author.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q4), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q4) %>%
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q4, y = Q9, fill = pct)) +
  # `linewidth` is the ggplot2 >= 3.4.0 name for the tile border width.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = sprintf("%.2f%%", pct)), color = "white", size = 2) +
  facet_grid(Q1 ~ .) +
  # After the flip, degrees run down the vertical axis and compensation bands
  # along the horizontal one.
  coord_flip() +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 35)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  ) +
  labs(title = "Yearly compensation by degree",
       x = "", y = "$", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

# Heat map of current role (Q6) against yearly compensation (Q9), one facet row
# per gender (Q1, "Female"/"Male" only).
# `pct` is the share of each role within a (gender, compensation band) pair,
# because summarise() drops only the last grouping variable (Q6).
# NOTE(review): if the intended reading is the compensation distribution within
# each role, group by (Q1, Q6, Q9) instead - confirm with the author.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q6), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q6) %>%
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q9, y = Q6, fill = pct)) +
  # `linewidth` is the ggplot2 >= 3.4.0 name for the tile border width.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = sprintf("%.2f%%", pct)), color = "white", size = 2) +
  facet_grid(Q1 ~ .) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 30)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 7),
    legend.text = element_text(size = 11)
  ) +
  labs(title = "Yearly compensation vs. current role",
       x = "$", y = "", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

To earn more money, you should be a data scientist,
a statistician or a data engineer. So,
it looks like money is where the data are.
If we cross-analyze these results with the experience in current
role, it appears that it takes 1-2 years of experience for women to earn
90-100’000$ as a data scientist, while men can earn 400’000
to more than 500’000$ with the same experience.
Years of experience seem to have little importance regarding how much
people earn (as a data scientist in particular), but gender has.
# Heat map of years of experience in the current role (Q8) against yearly
# compensation (Q9), one facet row per gender (Q1, "Female"/"Male" only).
# `pct` is the share of each experience level within a (gender, compensation
# band) pair, because summarise() drops only the last grouping variable (Q8).
# NOTE(review): if the intended reading is the compensation distribution within
# each experience level, group by (Q1, Q8, Q9) instead - confirm with the
# author.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q8), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q8) %>%
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q9, y = Q8, fill = pct)) +
  # `linewidth` is the ggplot2 >= 3.4.0 name for the tile border width.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = sprintf("%.2f%%", pct)), color = "white", size = 2) +
  facet_grid(Q1 ~ .) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    # vjust = 0.5 keeps the rotated labels centred on their tick marks.
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  ) +
  labs(title = "Yearly compensation by gender and experience in current role",
       x = "$", y = "Years of experience", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

The majority of African respondents are paid less than
10’000$. North America and Oceania have the largest
proportion of people that are paid more than 100’000$. In
Europe, the majority is in the 0-60’000$ range.
# Heat map of yearly compensation bands (Q9) per continent; each tile is
# coloured by, and labelled with, the share (%) of that band within its
# continent.
newMultipleChoice %>%
  filter(!is.na(Q9)) %>%
  group_by(Continent, Q9) %>%
  summarise(Count = n()) %>%
  mutate(pct = 100 * prop.table(Count)) %>%
  ggplot(mapping = aes(x = Continent, y = Q9, fill = pct)) +
  geom_tile() +
  geom_text(
    mapping = aes(label = sprintf("%.2f%%", pct)),
    color = "white", size = 3, hjust = 0.5, vjust = 0.25
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 30)) +
  labs(
    title = "Yearly compensation",
    caption = "Africa and the world",
    x = "", y = "", fill = ""
  ) +
  theme(
    legend.position = "none",
    legend.text = element_text(size = 11),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Summary and Conclusion
African respondents are from Egypt, Kenya, Morocco, Nigeria,
South Africa and Tunisia.
African countries are among the countries that have the highest
female to male ratio.
Most of the respondents are aged between 22 and 29 years
old.
The proportion of female students has decreased compared to last
year.
Female respondents are more highly educated than their male
counterparts.
Men are paid a lot more than women while having the same level of
experience.
Data science is still in its early stages.
South Africa and Kenya have the highest paid jobs, and Data
science is one of the most lucrative jobs.
In general, the respondents have only a few years of coding
experience but are willing to learn. The respondents from South Africa
have the most coding experience. And most respondents from Nigeria have
less than a year of experience.
Coursera is the most popular platform.
Python (by far), R and SQL are the most used programming
languages.
Most of the African respondents think of themselves as data
scientists.
It is a bit difficult to analyze such a small sample, but overall we
can say that Data Science and Machine Learning is still young in Africa.
It also has a bright future since more people are embarking on it. I am
hopeful that next year, we will have even more respondents. Overall, the
results tend to follow the global tendencies such as the use of Python.
But in other ways, Africa is different, as in the male to female
ratio.