Introducción

Kaggle se dirigió a su comunidad y realizó una encuesta con el objetivo de conocer las experiencias de sus miembros en ciencia de datos y Machine Learning; en este caso se analiza la muestra de personas africanas. En este estudio se escogieron personas de distintos países para tener una vista un poco más amplia y certera acerca del tema.

Importar librerías

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggforce)

Datasets

I will be using three datasets for this kernel:

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 16716 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (212): GenderSelect, Country, EmploymentStatus, StudentStatus, LearningD...
## dbl  (13): Age, LearningCategorySelftTaught, LearningCategoryOnlineCourses, ...
## num   (1): CompensationAmount
## lgl   (2): WorkToolsFrequencyAngoss, WorkToolsFrequencyKNIMECommercial
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 23860 Columns: 395
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (395): Time from Start to Finish (seconds), Q1, Q1_OTHER_TEXT, Q2, Q3, Q...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 56 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Continent
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# STRING PROCESSING
# countries
# Shorten long country names. The same four replacements are applied to the
# survey answers (Q3) and to the continent lookup table (Country).
# fixed() makes every pattern a literal string: as a regular expression the
# trailing "..." of the Iran entry would match any three characters.
multipleChoice18$Q3 <- str_replace(multipleChoice18$Q3, fixed("Iran, Islamic Republic of..."), "Iran")
multipleChoice18$Q3 <- str_replace(multipleChoice18$Q3, fixed("I do not wish to disclose my location"), "Won't disclose")
multipleChoice18$Q3 <- str_replace(multipleChoice18$Q3, fixed("United Kingdom of Great Britain and Northern Ireland"), "UK and NI")
multipleChoice18$Q3 <- str_replace(multipleChoice18$Q3, fixed("United States of America"), "USA")

continents$Country <- str_replace(continents$Country, fixed("Iran, Islamic Republic of..."), "Iran")
continents$Country <- str_replace(continents$Country, fixed("I do not wish to disclose my location"), "Won't disclose")
continents$Country <- str_replace(continents$Country, fixed("United Kingdom of Great Britain and Northern Ireland"), "UK and NI")
continents$Country <- str_replace(continents$Country, fixed("United States of America"), "USA")

# CONVERT CATEGORICAL DATA TO FACTOR
# age groups
# Levels are listed from youngest to oldest; the labels are the level strings
# themselves, which is factor()'s default when `labels` is omitted.
multipleChoice18$Q2 <- factor(
  multipleChoice18$Q2,
  levels = c(
    "18-21", "22-24", "25-29", "30-34", "35-39", "40-44",
    "45-49", "50-54", "55-59", "60-69", "70-79", "80+"
  )
)

# degree
# Recode the full survey answers (names) to short display labels (values).
multipleChoice18$Q4 <- local({
  degree_labels <- c(
    "Doctoral degree" = "PhD",
    "Master’s degree" = "Master",
    "Bachelor’s degree" = "Bachelor",
    "Professional degree" = "Professional",
    "No formal education past high school" = "High school",
    "Some college/university study without earning a bachelor’s degree" = "No degree",
    "I prefer not to answer" = "Won't disclose"
  )
  factor(
    multipleChoice18$Q4,
    levels = names(degree_labels),
    labels = unname(degree_labels)
  )
})

# undergraduate major
# Recode the full survey answers (names) to short display labels (values).
# Levels and labels are kept side by side in one named vector: factor() pairs
# them by position, and the previous separate vectors listed the labels in a
# different order than the levels, so seven majors were mislabelled (e.g.
# "Environmental science or geology" was shown as "Physics/astronomy").
# NOTE(review): answers not listed here become NA; confirm the list covers
# every Q5 option present in the data.
multipleChoice18$Q5 <- local({
  major_labels <- c(
    "Medical or life sciences (biology, chemistry, medicine, etc.)" = "Medical/life sciences",
    "Computer science (software engineering, etc.)" = "Computer science",
    "Engineering (non-computer focused)" = "Engineering",
    "Mathematics or statistics" = "Mathematics/statistics",
    "A business discipline (accounting, economics, finance, etc.)" = "A business discipline",
    "Environmental science or geology" = "Env. science",
    "Social sciences (anthropology, psychology, sociology, etc.)" = "Social sciences",
    "Physics or astronomy" = "Physics/astronomy",
    "Information technology, networking, or system administration" = "IT/Network/Sys. admin",
    "I never declared a major" = "No major declared",
    "Other" = "Other",
    "Humanities (history, literature, philosophy, etc.)" = "Humanities"
  )
  factor(
    multipleChoice18$Q5,
    levels = names(major_labels),
    labels = unname(major_labels)
  )
})

# In what industry is your current employer?
# Recode the survey answers (names) to display labels (values).
multipleChoice18$Q7 <- local({
  industry_labels <- c(
    "Retail/Sales" = "Retail / Sales",
    "I am a student" = "Student",
    "Computers/Technology" = "Computers / Technology",
    "Accounting/Finance" = "Accounting / Finance",
    "Academics/Education" = "Academics / Education",
    "Insurance/Risk Assessment" = "Insurance / Risk Assessment",
    "Other" = "Other",
    "Energy/Mining" = "Energy / Mining",
    "Non-profit/Service" = "Non-profit / Service",
    "Marketing/CRM" = "Marketing / CRM",
    "Government/Public Service" = "Government / Public Service",
    "Manufacturing/Fabrication" = "Manufacturing / Fabrication",
    "Online Service/Internet-based Services" = "Online Service / Internet-based Services",
    "Broadcasting/Communications" = "Broadcasting / Communications",
    "Medical/Pharmaceutical" = "Medical / Pharmaceutical",
    "Online Business/Internet-based Sales" = "Online Business / Internet-based Sales",
    "Military/Security/Defense" = "Military / Security/Defense",
    "Shipping/Transportation" = "Shipping / Transportation",
    "Hospitality/Entertainment/Sports" = "Hospitality / Entertainment/Sports"
  )
  factor(
    multipleChoice18$Q7,
    levels = names(industry_labels),
    labels = unname(industry_labels)
  )
})

# experience in current role
# Year ranges listed from shortest to longest.
multipleChoice18$Q8 <- factor(
  multipleChoice18$Q8,
  levels = c(
    "0-1", "1-2", "2-3", "3-4", "4-5", "5-10",
    "10-15", "15-20", "20-25", "25-30", "30+"
  )
)

# yearly compensation
# The compensation brackets keep their text as label; only the
# "do not wish to disclose" answer is shortened. It is the first level.
multipleChoice18$Q9 <- local({
  brackets <- c(
    "0-10,000", "10-20,000", "20-30,000", "30-40,000",
    "40-50,000", "50-60,000", "60-70,000", "70-80,000",
    "80-90,000", "90-100,000", "100-125,000",
    "125-150,000", "150-200,000", "200-250,000",
    "250-300,000", "300-400,000", "400-500,000", "500,000+"
  )
  factor(
    multipleChoice18$Q9,
    levels = c("I do not wish to disclose my approximate yearly compensation", brackets),
    labels = c("Won't disclose", brackets)
  )
})

# time spent coding
# Recode the answers (names) to labels without the " of my time" suffix (values).
multipleChoice18$Q23 <- local({
  coding_time_labels <- c(
    "0% of my time" = "0%",
    "1% to 25% of my time" = "1% to 25%",
    "25% to 49% of my time" = "25% to 49%",
    "50% to 74% of my time" = "50% to 74%",
    "75% to 99% of my time" = "75% to 99%",
    "100% of my time" = "100%"
  )
  factor(
    multipleChoice18$Q23,
    levels = names(coding_time_labels),
    labels = unname(coding_time_labels)
  )
})
# coding experience
# The two "never written code" answers get shorter labels; the year ranges
# keep their text as label.
multipleChoice18$Q24 <- local({
  year_ranges <- c(
    "< 1 year", "1-2 years", "3-5 years", "5-10 years",
    "10-20 years", "20-30 years", "30-40 years", "40+ years"
  )
  factor(
    multipleChoice18$Q24,
    levels = c(
      "I have never written code and I do not want to learn",
      "I have never written code but I want to learn",
      year_ranges
    ),
    labels = c(
      "I don't write code and don't want to learn",
      "I don't write code but want to learn",
      year_ranges
    )
  )
})

# For how many years have you used machine learning methods
# The two "never studied" answers get shorter labels; the year ranges keep
# their text as label.
multipleChoice18$Q25 <- local({
  year_ranges <- c(
    "< 1 year", "1-2 years", "2-3 years", "3-4 years", "4-5 years",
    "5-10 years", "10-15 years", "20+ years"
  )
  factor(
    multipleChoice18$Q25,
    levels = c(
      "I have never studied machine learning and I do not plan to",
      "I have never studied machine learning but plan to learn in the future",
      year_ranges
    ),
    labels = c(
      "Never studied, do not plan to",
      "Never studied, plan to learn",
      year_ranges
    )
  )
})

# use of machine learning in industries
# Recode the full survey answers (names) to short display labels (values).
multipleChoice18$Q10 <- local({
  ml_usage_labels <- c(
    "I do not know" = "I do not know",
    "No (we do not use ML methods)" = "No",
    "We are exploring ML methods (and may one day put a model into production)" = "Exploring ML methods",
    "We recently started using ML methods (i.e., models in production for less than 2 years)" = "Recently started",
    "We have well established ML methods (i.e., models in production for more than 2 years)" = "Well established ML methods",
    "We use ML methods for generating insights (but do not put working models into production)" = "For generating insights"
  )
  factor(
    multipleChoice18$Q10,
    levels = names(ml_usage_labels),
    labels = unname(ml_usage_labels)
  )
})
# expertise in data science
# Q40 compares independent projects with academic achievements. Recode the
# full answers (names) to short display labels (values).
# The "slightly less important" answer is labelled "Slightly less important"
# (previously "Less important") so it mirrors "Slightly more important".
# NOTE(review): update any later code that refers to the old "Less important"
# label.
multipleChoice18$Q40 <- local({
  importance_labels <- c(
    "Independent projects are equally important as academic achievements" = "Equally important",
    "Independent projects are much more important than academic achievements" = "Much more important",
    "Independent projects are slightly more important than academic achievements" = "Slightly more important",
    "Independent projects are slightly less important than academic achievements" = "Slightly less important",
    "Independent projects are much less important than academic achievements" = "Much less important",
    "No opinion; I do not know" = "No opinion/Don't know"
  )
  factor(
    multipleChoice18$Q40,
    levels = names(importance_labels),
    labels = unname(importance_labels)
  )
})


# are you a data scientist?
# Levels run from "Definitely yes" to "Definitely not"; the labels are the
# level strings themselves (factor()'s default when `labels` is omitted).
multipleChoice18$Q26 <- factor(
  multipleChoice18$Q26,
  levels = c(
    "Definitely yes", "Probably yes", "Maybe",
    "Probably not", "Definitely not"
  )
)

Quiénes somos

# Bar chart of the number of respondents per continent, ordered from most to
# fewest; the bar fill is driven by a flag that is TRUE only for Africa.
newMultipleChoice %>%
  group_by(Continent) %>%
  summarise(Count = n()) %>%
  # The comparison already yields a logical; no ifelse(..., T, F) needed.
  mutate(highlight_flag = Continent == "Africa") %>%
  ggplot(aes(x = reorder(Continent, -Count), y = Count, fill = Continent)) +
  geom_bar(aes(fill = highlight_flag), stat = "identity", color = "grey") +
  # Print each count just above its bar.
  geom_text(aes(label = as.character(Count)),
            position = position_dodge(width = 1),
            hjust = 0.5, vjust = -0.25, size = 3) +
  scale_fill_brewer(palette = "PuBu") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        legend.position = "none",
        legend.text = element_text(size = 11)) +
  labs(title = "Number of respondents",
       x = "", y = "Count", fill = "",
       caption = "Africa and the world")

En comparación con el año anterior, el número de encuestados africanos muestra un aumento significativo del 109,54%.

# Line chart of the number of respondents per country for each survey year
# (one line and one set of points per country).
p1 <- df %>%
  group_by(Country, Year) %>%
  # State the default regrouping explicitly; this also silences the
  # summarise() message.
  summarise(Count = n(), .groups = "drop_last") %>%
  ggplot(aes(x = Year, y = Count, group = Country)) +
  # `linewidth` replaces `size` for lines (deprecated in ggplot2 3.4.0).
  geom_line(aes(color = Country), linewidth = 0.5) +
  geom_point(aes(color = Country), size = 4) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text = element_text(size = 12),
        legend.position = "bottom",
        legend.title = element_blank(),
        legend.text = element_text(size = 10)) +
  labs(title = "Number of respondents",
       x = "", y = "Count", fill = "", caption = "")
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
# Stacked bar chart of respondents per country of residence (Q3), split by
# gender (Q1); only "Female" and "Male" answers are kept.
p2 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  group_by(Q1, Q3) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Q3, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 10)
  ) +
  labs(
    title = "Country of residence",
    x = "", y = "", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# Draw both plots side by side.
grid.arrange(p1, p2, ncol = 2)

Aparte del increíble aumento de encuestados de Nigeria, también podemos ver la introducción de nuevos encuestados de Marruecos y Túnez.

En las gráficas 4 y 5, la cantidad de mujeres es menor que la de los hombres, pero con la mayor cantidad de encuestados de todos los países presentes en la gráfica. También se puede observar que Túnez es el país con mayor población entre hombres y mujeres. En la gráfica 5 (la de la derecha) también se puede observar que el país con menos personas encuestadas fue Marruecos. En conclusión, el continente que ocupa el primer lugar es Norteamérica y el que ocupa el último lugar es Sudamérica.

# Bar chart of the female-to-male respondent ratio per country (Q3), ordered
# from highest to lowest ratio; the six listed African countries are filled
# differently from the rest.
multipleChoice18 %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q3)) %>%
  group_by(Q1, Q3) %>%
  summarise(Count = n(), .groups = "drop") %>%
  # One row per country with a Female and a Male count column
  # (pivot_wider() supersedes spread()).
  pivot_wider(names_from = Q1, values_from = Count) %>%
  mutate(
    ratio = Female / Male,
    highlight_flag = Q3 %in% c("Egypt", "Kenya", "Morocco",
                               "Nigeria", "Tunisia", "South Africa")
  ) %>%
  ggplot(aes(x = reorder(Q3, -ratio), y = ratio)) +
  geom_bar(aes(fill = highlight_flag), stat = "identity") +
  scale_fill_brewer(palette = "Paired") +
  theme(legend.position = "none",
        plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 11),
        axis.text.x = element_text(size = 9.5, angle = -90,
                                   hjust = 0, vjust = 0.5)) +
  labs(title = "Female to Male ratio",
       x = "", y = "Ratio", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Bar chart of the female-to-male respondent ratio per continent, ordered from
# highest to lowest ratio; the bar fill is driven by a flag that is TRUE only
# for Africa.
newMultipleChoice %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  group_by(Continent, Q1) %>%
  summarise(Count = n(), .groups = "drop") %>%
  # One row per continent with a Female and a Male count column
  # (pivot_wider() supersedes spread()).
  pivot_wider(names_from = Q1, values_from = Count) %>%
  mutate(
    ratio = Female / Male,
    highlight_flag = Continent == "Africa"
  ) %>%
  ggplot(aes(x = reorder(Continent, -ratio), y = ratio)) +
  geom_bar(aes(fill = highlight_flag), stat = "identity", color = "grey") +
  scale_fill_brewer(palette = "PuBu") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(legend.position = "none",
        plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 12),
        axis.text.x = element_text(size = 12)) +
  labs(title = "Female to Male ratio",
       x = "", y = "Ratio", fill = "",
       caption = "Africa and the world")
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

En la gráfica 4, se puede observar que la mayor cantidad de encuestados, tanto hombres como mujeres tienen la edad entre 22 y 24 años.

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# Pyramid-style age chart: male counts are plotted as positive values and
# female counts as negative values on the same (flipped) axis. The axis labels
# show magnitudes only, so both sides read as positive counts.
ggplot(temp, aes(x = Q2, fill = Q1)) +
  geom_col(data = filter(temp, Q1 == "Male"), aes(y = Count)) +
  geom_col(data = filter(temp, Q1 == "Female"), aes(y = -Count)) +
  scale_y_continuous(
    breaks = c(-50, 0, 50, 100, 150),
    labels = c("50", "0", "50", "100", "150")
  ) +
  scale_fill_brewer(palette = "Paired") +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Age distribution in Africa",
    x = "Age group (years)", y = "Count", fill = "",
    caption = ""
  )

Alrededor del 75% de los Afrikagglers son menores de 29 años.

Antecedentes educativos

África es el único continente donde la licenciatura es el título más obtenido. En todos los demás continentes, el título más común es el de maestría, especialmente en Europa, donde la proporción de titulados con doctorado también es la más alta.

# Line chart of the percentage of each degree (Q4) within every continent,
# one line per continent.
p1 <- newMultipleChoice %>%
  filter(!is.na(Q4)) %>%
  group_by(Continent, Q4) %>%
  # drop_last keeps the Continent grouping, so pct is a share per continent.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q4, y = pct, group = Continent)) +
  # `linewidth` replaces `size` for lines (deprecated in ggplot2 3.4.0).
  geom_line(aes(color = Continent), linewidth = 0.5) +
  geom_point(aes(color = Continent), size = 2) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  # No scale_y_discrete() here: pct is numeric, so the default continuous
  # y scale is the one that applies.
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text = element_text(size = 12),
        axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Educational background",
       x = "", y = "%", fill = "",
       caption = "")
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.
# Stacked column of degree shares (Q4), one facet row per gender (Q1); the
# percentages are computed within each gender.
p2 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q4)) %>%
  group_by(Q1, Q4) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot(aes(x = "", y = pct, fill = Q4)) +
  geom_col(width = 1) +
  scale_fill_brewer(palette = "Set3") +
  facet_grid(rows = vars(Q1)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Degree",
    x = "", y = "", fill = "Degree",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# Draw both plots side by side.
grid.arrange(p1, p2, ncol = 2)

Pero cuando miras de cerca en África, el porcentaje de mujeres que obtienen títulos de maestría y doctorado es más alto que el de los hombres. Además, solo un pequeño porcentaje de ellas no tiene ningún título, lo que no es el caso de los hombres.

Carrera de pregrado

El título universitario más popular de Kagglers es Informática. Las otras dos carreras son Matemáticas/Estadística e Ingeniería.

# Bubble chart of undergraduate major (Q5) per continent: point size encodes
# the percentage of the continent's respondents with that major.
newMultipleChoice %>%
  filter(!is.na(Q5)) %>%
  group_by(Continent, Q5) %>%
  # drop_last keeps the Continent grouping, so pct is a share per continent.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot() +
  geom_point(mapping = aes(x = Continent, y = reorder(Q5, pct),
                           size = pct, color = Q5)) +
  # The former scale_fill_gradient() was dropped: no layer maps `fill`, so it
  # had no effect on the plot.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 11),
        axis.text.x = element_text(size = 12),
        legend.position = "none",
        legend.text = element_text(size = 11)) +
  labs(title = "Undergraduate major",
       x = "", y = "", fill = "",
       caption = "Africa and the world")
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

En África, el porcentaje de mujeres encuestadas con formación en informática es mayor que el de los hombres encuestados. Lo mismo ocurre con Matemáticas/Estadística. Entonces, ¡buen trabajo, señoras!

Además, un mayor porcentaje de hombres no han declarado ninguna carrera. Esto puede deberse al hecho de que algunos de ellos “no tienen título”.

# Line chart of the percentage of each undergraduate major (Q5) within each
# gender (Q1); only "Female" and "Male" answers are kept.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q5)) %>%
  group_by(Q1, Q5) %>%
  # drop_last keeps the Q1 grouping, so pct is a share per gender.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q5, -pct), y = pct, group = Q1)) +
  geom_point(aes(color = Q1), size = 2) +
  # `linewidth` replaces `size` for lines (deprecated in ggplot2 3.4.0).
  geom_line(aes(color = Q1), linewidth = 1) +
  # The former scale_fill_brewer() was dropped: no layer maps `fill`, so it
  # had no effect on the plot.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 11),
        axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Undergraduate major",
       x = "", y = "%", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Experiencia profesional

# Bubble chart of current role (Q6) per continent: point size scales with the
# percentage of the continent's respondents in that role.
newMultipleChoice %>%
  filter(!is.na(Q6)) %>%
  group_by(Continent, Q6) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot() +
  geom_point(
    mapping = aes(
      x = Continent, y = reorder(Q6, pct),
      size = 5 * pct, color = Q6
    )
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 8)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Current role",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Alrededor del 25% de los encuestados son estudiantes. Luego, los trabajos más representados son los científicos de datos, los analistas de datos y los ingenieros de software.

# Line chart of the percentage of each current role (Q6) within each gender
# (Q1); only "Female" and "Male" answers are kept.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q6)) %>%
  group_by(Q1, Q6) %>%
  # drop_last keeps the Q1 grouping, so pct is a share per gender.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q6, -pct), y = pct, group = Q1)) +
  geom_point(aes(color = Q1), size = 2) +
  # `linewidth` replaces `size` for lines (deprecated in ggplot2 3.4.0).
  geom_line(aes(color = Q1), linewidth = 1) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 12),
        axis.text.x = element_text(size = 11, angle = -90,
                                   hjust = 0, vjust = 0.5),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Current role", x = "", y = "%", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Heatmap of current role (Q6) against country (Q3). The fill is the
# percentage of each role's respondents that come from each country.
afroCountries %>%
  filter(!is.na(Q6)) %>%
  group_by(Q6, Q3) %>%
  # drop_last keeps the Q6 grouping, so pct is a share per role.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q6, y = Q3, fill = pct)) +
  # `linewidth` replaces `size` for the tile border (ggplot2 3.4.0).
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
        axis.text.y = element_text(size = 12),
        legend.text = element_text(size = 11)) +
  labs(title = "Current role by country",
       x = "", y = "", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

# 2017 pie charts (one facet per gender) of StudentStatus shares; only
# "Female" and "Male" answers are kept and percentages are computed within
# each gender.
propStud17 <- afroCountries17 %>%
  filter(GenderSelect %in% c("Female", "Male"), !is.na(StudentStatus)) %>%
  group_by(GenderSelect, StudentStatus) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot(aes(x = "", y = pct, fill = StudentStatus)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = pi / 3) +
  scale_fill_brewer(palette = "Paired") +
  facet_wrap(~ GenderSelect) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  ) +
  labs(
    title = "Proportion of students", subtitle = "2017",
    x = "", y = "", fill = "",
    caption = ""
  )
## `summarise()` has grouped output by 'GenderSelect'. You can override using the
## `.groups` argument.
# 2018 pie charts (one facet per gender) of Q6 shares taken from propStud18;
# only "Female" and "Male" answers are kept and percentages are computed
# within each gender.
propStud <- propStud18 %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q6)) %>%
  group_by(Q1, Q6) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot(aes(x = "", y = pct, fill = Q6)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = pi / 3) +
  scale_fill_brewer(palette = "Paired") +
  facet_wrap(~ Q1) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  ) +
  labs(
    title = "Proportion of students", subtitle = "2018",
    x = "", y = "", fill = "",
    caption = "About us"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# Draw the 2017 and 2018 charts side by side.
grid.arrange(propStud17, propStud, ncol = 2)

# 2017
# Horizontal bar chart of respondent counts per job title, smallest count at
# the bottom; missing job titles are excluded.
p1 <- afroCountries17 %>%
  filter(!is.na(CurrentJobTitleSelect)) %>%
  group_by(CurrentJobTitleSelect) %>%
  summarise(Count = n()) %>%
  ggplot(aes(
    x = reorder(CurrentJobTitleSelect, Count),
    y = Count,
    fill = CurrentJobTitleSelect
  )) +
  geom_col() +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(
    title = "Current role", subtitle = "2017",
    x = "", y = "Count", fill = "",
    caption = ""
  )

# 2018
# Horizontal bar chart of respondent counts per current role (Q6), smallest
# count at the bottom; students and missing roles are excluded.
p2 <- afroCountries %>%
  filter(!is.na(Q6), Q6 != "Student") %>%
  group_by(Q6) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Q6, Count), y = Count, fill = Q6)) +
  geom_col() +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(
    title = "Current role", subtitle = "2018",
    x = "", y = "Count", fill = "",
    caption = "About us"
  )

# Draw the 2017 and 2018 charts side by side.
grid.arrange(p1, p2, ncol = 2)

Experiencia en el cargo actual

# Heatmap of current role (Q6) against years of experience in that role (Q8).
# Both the fill and the printed number are the respondent count of each cell.
afroCountries %>%
  filter(!is.na(Q6), !is.na(Q8)) %>%
  group_by(Q6, Q8) %>%
  # Only Count is plotted; the pct column that used to be computed here was
  # never used, so it is no longer derived.
  summarise(Count = n(), .groups = "drop_last") %>%
  ggplot(aes(x = Q8, y = Q6, fill = Count)) +
  # `linewidth` replaces `size` for the tile border (ggplot2 3.4.0).
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  theme(legend.position = "none",
        plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 11,
                                   hjust = 0, vjust = 0.5),
        axis.text.y = element_text(size = 9),
        legend.text = element_text(size = 11)) +
  labs(title = "Experience in current role",
       x = "Years", y = "", fill = "",
       caption = "About us")
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Uso del aprendizaje automático en la industria

# Heatmap of Q10 answers per current role (Q6). Tile colour is the share of
# each answer within a role; the printed number is the raw count.
afroCountries %>%
  filter(!is.na(Q10)) %>%
  group_by(Q6, Q10) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q6, so shares are per role.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q10, y = Q6, fill = pct)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  labs(
    title = "Use of ML in industries",
    x = "", y = "", fill = "",
    caption = "Machine learning usage"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Actividades en el trabajo

# Heatmap of the Q11 multi-select answers per current role (Q6). The seven
# Q11 columns are stacked into one `Function` column, then counted per role.
afroCountries %>%
  select(
    Q6, Q11_Part_1, Q11_Part_2, Q11_Part_3, Q11_Part_4,
    Q11_Part_5, Q11_Part_6, Q11_Part_7
  ) %>%
  gather(2:8, key = "questions", value = "Function") %>%
  filter(!is.na(Function)) %>%
  group_by(Q6, Function) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q6, so shares are per role.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = Function, y = Q6, fill = pct)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  labs(
    title = "Day to day function",
    x = "", y = "", fill = "",
    caption = "Machine learning use"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 9),
    legend.position = "none"
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Experiencia en codificación

¿Cuánto tiempo ha estado codificando para analizar datos?

En primer lugar, no hay nadie que no escriba código y que no quiera aprender. La mayoría ha estado codificando por menos de 5 años, especialmente las mujeres. La mayoría de ellos recién están comenzando y tienen 1-2 años de experiencia.

# Stacked bars: respondents per coding-experience bracket (Q24), split by
# Q1 (only "Female" and "Male" answers are kept).
p1 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q24)) %>%
  group_by(Q1, Q24) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Q24, y = Count, fill = Q1)) +
  geom_col() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Coding experience",
    x = "", y = "Count", fill = ""
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(size = 10, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
# One line per Q3 value: share (in %) of each coding-experience bracket
# (Q24) within that Q3 group. Rows missing Q3 or Q24 are dropped.
p2 <- afroCountries %>%
  filter(!is.na(Q3), !is.na(Q24)) %>%
  group_by(Q3, Q24) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q3, so shares sum to 100 per group.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q24, y = pct, group = Q3, color = Q3)) +
  geom_point(size = 1.5) +
  # Line width is `linewidth` since ggplot2 3.4.0; `size` is deprecated for lines.
  geom_line(linewidth = 0.5) +
  # Only `color` is mapped, so no fill scale or fill label is needed.
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  labs(
    title = "Coding experience by country",
    x = "", y = "%",
    caption = "About us"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 10, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q3'. You can override using the `.groups`
## argument.
# Draw both coding-experience plots in a two-column layout.
grid.arrange(p1, p2, ncol = 2)

Los ingenieros de datos presentan el mayor porcentaje de personas con la experiencia más larga en codificación (5-10 años). La mayoría de los científicos investigadores tienen de 3 a 5 años de experiencia en codificación.

# Heatmap of coding-experience brackets (Q24) per current role (Q6). Tile
# colour is the share within a role; the printed number is the raw count.
afroCountries %>%
  filter(!is.na(Q24)) %>%
  group_by(Q6, Q24) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q6, so shares are per role.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q24, y = Q6, fill = pct)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  labs(
    title = "Coding experience", x = "", y = "",
    caption = "Coding experience"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Lenguajes de programación

Lenguaje de programación más utilizado

# Heatmap of the most used programming language (Q17) per current role
# (Q6). Languages are ordered by decreasing mean share; tile colour is the
# share within a role and the printed number is the raw count.
afroCountries %>%
  filter(!is.na(Q17)) %>%
  group_by(Q6, Q17) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q6, so shares are per role.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q17, -pct), y = Q6, fill = pct)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  labs(
    title = "Most used programming language", x = "", y = "",
    caption = "Coding experience"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Todo el mundo recomienda Python a una persona nueva que aspira a ser un científico de datos.

# Heatmap of the recommended programming language (Q18) per current role
# (Q6). Languages are ordered by decreasing mean share; tile colour is the
# share within a role and the printed number is the raw count.
afroCountries %>%
  filter(!is.na(Q18)) %>%
  group_by(Q6, Q18) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q6, so shares are per role.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q18, -pct), y = Q6, fill = pct)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  labs(
    title = "Recommended programming language", x = "", y = "",
    caption = "Coding experience"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Incluso aquellos que usan regularmente otro lenguaje para analizar datos recomiendan aprender Python primero. Pero aun así, algunos usuarios fieles de R recomiendan R.

# Cross-tabulation of most used (Q17) against recommended (Q18) language.
# Tile colour is the share of each recommendation among users of a given
# language; the printed number is the raw count.
afroCountries %>%
  filter(!is.na(Q17), !is.na(Q18)) %>%
  group_by(Q17, Q18) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q17, so shares are per used language.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q17, pct), y = Q18, fill = pct)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 20)) +
  labs(
    title = "Most used vs. Recommended programming languages",
    x = "Most used", y = "Recommended",
    caption = "Coding experience"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.x = element_text(angle = 35, hjust = 1),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q17'. You can override using the `.groups`
## argument.

Tiempo dedicado a la codificación activa

# One line per Q1 value ("Female"/"Male"): share (in %) of each Q23 answer
# (time spent actively coding) within that group.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q23)) %>%
  group_by(Q1, Q23) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q1, so shares sum to 100 per group.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q23, y = pct, group = Q1, color = Q1)) +
  geom_point(size = 2) +
  # Line width is `linewidth` since ggplot2 3.4.0; `size` is deprecated for lines.
  geom_line(linewidth = 1) +
  # Only `color` is mapped, so no fill scale or fill label is needed.
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 10)) +
  labs(
    title = "Time spent actively coding",
    x = "of time", y = "% of people",
    caption = "Coding experience"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 12),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Los estudiantes, ingenieros de software, analistas de datos y científicos de datos pasaron la mayor parte del tiempo codificando, del 50 al 74 % de su tiempo.

# Heatmap of Q23 answers (time spent coding) per current role (Q6). Tile
# colour is the share within a role; the printed number is the raw count.
afroCountries %>%
  filter(!is.na(Q23)) %>%
  group_by(Q6, Q23) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Q6, so shares are per role.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q23, y = Q6, fill = pct)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  labs(
    title = "Time spent coding",
    x = "of time", y = "",
    caption = "Coding experience"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

IDE

El IDE más utilizado es Jupyter/Ipython. El segundo IDE elegido por los investigadores es MATLAB, mientras que los estudiantes prefieren Notepad++. Los ingenieros de datos y los analistas de datos eligen RStudio.

# Heatmap of IDE answers per current role (Q6). The IDE columns are picked
# by position and stacked into one `IDEs` column; tile colour and label
# show the raw count, while the x order follows decreasing mean share.
# NOTE(review): 30:45 selects 16 columns but only positions 2:16 (15 of
# them) are gathered, so the last selected column is left out of the
# stack - confirm this is intended.
afroCountries %>%
  select(Q6, 30:45) %>%
  gather(2:16, key = "questions", value = "IDEs") %>%
  filter(!is.na(IDEs)) %>%
  group_by(Q6, IDEs) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 after summarise(), so shares are per role.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = reorder(IDEs, -pct), y = Q6, fill = Count)) +
  geom_tile() +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  labs(
    title = "IDEs",
    x = "", y = "", fill = "",
    caption = "Coding experience"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Cuaderno alojado

La mayoría de los encuestados no utiliza cuadernos (notebooks) alojados. Kaggle Kernels es la opción preferida entre quienes sí los usan.

# Heatmap of hosted-notebook answers (Q14 parts 1-11) per current role
# (Q6). Tile colour and label show the raw count; the x order follows
# decreasing mean share within a role.
afroCountries %>%
  select(Q6, Q14_Part_1:Q14_Part_11) %>%
  gather(2:12, key = "questions", value = "Hosted_Notebook") %>%
  filter(!is.na(Hosted_Notebook)) %>%
  group_by(Q6, Hosted_Notebook) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 after summarise(), so shares are per role.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = reorder(Hosted_Notebook, -pct), y = Q6, fill = Count)) +
  geom_tile() +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  labs(
    title = "Hosted notebook used at school or work",
    subtitle = "(past 5 years)",
    x = "", y = "", fill = "",
    caption = ""
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Herramienta principal para analizar datos

La mayoría de los encuestados, especialmente los estudiantes y los científicos de datos, prefieren usar entornos de desarrollo locales/alojados al analizar datos.

# Heatmap of the primary analysis tool (Q12_MULTIPLE_CHOICE) per current
# role (Q6). Tile colour and label show the raw count; tools are ordered by
# decreasing mean count.
afroCountries %>%
  filter(!is.na(Q12_MULTIPLE_CHOICE)) %>%
  group_by(Q6, Q12_MULTIPLE_CHOICE) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Q12_MULTIPLE_CHOICE, -Count), y = Q6, fill = Count)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  labs(
    title = "Primary tools for data analysis",
    x = "", y = "",
    caption = "Coding experience"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

¿Eres un científico de datos?

# Stacked bars: respondents per Q26 answer, split by Q1 (only "Female" and
# "Male" answers are kept).
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q26)) %>%
  group_by(Q1, Q26) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Q26, y = Count, fill = Q1)) +
  geom_col() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 8)) +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Think of themself as a data scientist",
    x = "", y = "Count", fill = "",
    caption = "Personal views"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

En comparación con los otros continentes, los Afrikagglers tienen más confianza en su identidad como científicos de datos: el 59,89 % respondió “Sí”.

# One line per continent: share (in %) of each Q26 answer within that
# continent. Rows with a missing Q26 answer are dropped.
newMultipleChoice %>%
  filter(!is.na(Q26)) %>%
  group_by(Continent, Q26) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the data grouped by Continent, so shares are per continent.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q26, y = pct, group = Continent, color = Continent)) +
  # Line width is `linewidth` since ggplot2 3.4.0; `size` is deprecated for lines.
  geom_line(linewidth = 0.5) +
  geom_point(size = 2) +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  labs(
    title = "Think of themself as a data scientist",
    x = "", y = "%", fill = "",
    caption = "Africa and the world"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Uso de métodos de aprendizaje automático en el trabajo o la escuela

Los Afrikagglers comenzaron a usar métodos de aprendizaje automático recientemente. La mayoría los ha usado por menos de un año.

# Heatmap of Q25 answers (machine learning usage at work/school) per
# current role (Q6). Tile colour and label both show the raw count.
afroCountries %>%
  filter(!is.na(Q25)) %>%
  group_by(Q6, Q25) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Q25, y = Q6, fill = Count)) +
  # Border width is `linewidth` since ggplot2 3.4.0; `size` is deprecated here.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  labs(
    title = "Usage of machine learning at work/school",
    x = "", y = "",
    caption = "Coding experience"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Productos de aprendizaje automático

# Bubble chart of the five highest-share ML products per current role (Q6).
# Product columns are picked by position (152:194) and stacked into one
# `ML_Products` column; point size encodes the share within a role.
afroCountries %>%
  select(Q6, 152:194) %>%
  gather(2:44, key = "questions", value = "ML_Products") %>%
  filter(!is.na(ML_Products)) %>%
  group_by(Q6, ML_Products) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 here, so both the share and the top 5 are per role.
  mutate(pct = Count / sum(Count) * 100) %>%
  top_n(5, pct) %>%
  ggplot(aes(
    x = reorder(ML_Products, -Count), y = Q6,
    size = pct, color = ML_Products
  )) +
  geom_point() +
  labs(
    title = "Machine learning products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 8),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

# Bubble chart of the five most-counted ML products per continent. Product
# columns are picked by position (152:194) and stacked into one
# `ML_Products` column; point size encodes the share within a continent.
newMultipleChoice %>%
  select(Continent, 152:194) %>%
  gather(2:44, key = "questions", value = "ML_Products") %>%
  filter(!is.na(ML_Products)) %>%
  group_by(Continent, ML_Products) %>%
  summarise(Count = n()) %>%
  # Still grouped by Continent here, so share and top 5 are per continent.
  mutate(pct = Count / sum(Count) * 100) %>%
  top_n(5, Count) %>%
  ggplot(aes(
    x = Continent, y = reorder(ML_Products, pct),
    size = pct, color = ML_Products
  )) +
  geom_point() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  labs(
    title = "Machine learning products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 8),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Framework de aprendizaje automático más utilizado

Scikit-Learn, TensorFlow y Keras son los frameworks de aprendizaje automático más utilizados en general, lo que concuerda con la tendencia general de todos los países.

# Heatmap of ML framework answers (Q19 parts) per current role (Q6). Tile
# colour is the share within a role; the printed number is the raw count.
# NOTE(review): Q19_Part_1:Q19_Part_19 is selected, but positions 2:19
# gather one column fewer than that range names, so the last selected
# column is left out of the stack - confirm this is intended.
afroCountries %>%
  select(Q6, Q19_Part_1:Q19_Part_19) %>%
  gather(2:19, key = "questions", value = "ML_Framework") %>%
  filter(!is.na(ML_Framework)) %>%
  group_by(Q6, ML_Framework) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 after summarise(), so shares are per role.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = reorder(ML_Framework, -Count), y = Q6, fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  labs(
    title = "Machine learning framework (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

¿Considera el aprendizaje automático como una “caja negra”?

# 100%-stacked horizontal bars: distribution of Q48 answers within each Q1
# group. Keeping only "Female"/"Male" also removes missing Q1 values.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q48)) %>%
  group_by(Q1, Q48) %>%
  ggplot(aes(x = Q1, fill = Q48)) +
  geom_bar(position = "fill") +
  coord_flip() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 10)) +
  scale_fill_brewer(palette = "Set3") +
  # Single-column legend so long answer texts stay readable.
  guides(fill = guide_legend(ncol = 1)) +
  labs(
    title = "Do you consider ML as 'black boxes'?",
    x = "", y = "", fill = "", caption = "Personal views"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    legend.position = "bottom",
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

# 100%-stacked bars: distribution of the Q42 metric answers within each Q3
# group. The five Q42 columns are stacked into one `metrics` column first.
afroCountries %>%
  select(Q42_Part_1:Q42_Part_5, Q3) %>%
  gather(1:5, key = "questions", value = "metrics") %>%
  filter(!is.na(metrics), !is.na(Q3)) %>%
  group_by(metrics, Q3) %>%
  ggplot(aes(x = Q3, fill = metrics)) +
  geom_bar(position = "fill") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  scale_fill_brewer(palette = "Set3") +
  # Single-column legend so long answer texts stay readable.
  guides(fill = guide_legend(ncol = 1)) +
  labs(
    title = "Metrics used to measure model success",
    x = "", y = "", fill = "", caption = "Machine learning usage"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11),
    legend.position = "bottom"
  )

Ese es también el caso para todos los países/continentes.

# Heatmap of the Q42 metric answers per continent. Tile colour and label
# both show the share (in %) of each metric within a continent.
newMultipleChoice %>%
  select(Continent, Q42_Part_1:Q42_Part_5) %>%
  gather(2:6, key = "questions", value = "Metrics") %>%
  filter(!is.na(Metrics)) %>%
  group_by(Continent, Metrics) %>%
  summarise(Count = n()) %>%
  # Still grouped by Continent after summarise(), so shares are per continent.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = Continent, y = reorder(Metrics, pct), fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = sprintf("%.2f%%", pct)),
    hjust = 0.5, vjust = 0.5, size = 4, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 20)) +
  labs(
    title = "Metrics used by organizations",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

¿Qué pasa con los datos?

Tipo de datos más utilizado

# Heatmap of the data type answer (Q32) per current role (Q6). Tile colour
# is the share within a role; the printed number is the raw count.
afroCountries %>%
  select(Q6, Q32) %>%
  filter(!is.na(Q32)) %>%
  group_by(Q6, Q32) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 after summarise(), so shares are per role.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q32, -Count), y = Q6, fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  labs(
    title = "Most used data types",
    x = "Type of data", y = "", fill = "",
    caption = "Data"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Dónde encontrar conjuntos de datos públicos

# Stacked bars: respondents per public-dataset source, split by Q1 (only
# "Female" and "Male"). Source columns are picked by position (266:276)
# and stacked into one `DataSource` column.
afroCountries %>%
  select(Q1, 266:276) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  gather(2:12, key = "questions", value = "DataSource") %>%
  filter(!is.na(DataSource)) %>%
  group_by(Q1, DataSource) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(DataSource, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 20)) +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Sources used to get public datasets",
    x = "", y = "Count", fill = "",
    caption = "Data"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Otras herramientas y servicios

Servicios de computación en la nube en el trabajo/escuela

# Heatmap of cloud computing service answers (Q15 parts 1-7) per current
# role (Q6). Tile colour is the share within a role; the printed number is
# the raw count.
afroCountries %>%
  select(Q6, Q15_Part_1:Q15_Part_7) %>%
  gather(2:8, key = "questions", value = "Cloud_services") %>%
  filter(!is.na(Cloud_services)) %>%
  group_by(Q6, Cloud_services) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 after summarise(), so shares are per role.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = reorder(Cloud_services, -Count), y = Q6, fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  labs(
    title = "Cloud computing services at work/school",
    x = "", y = "", fill = "",
    caption = ""
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Productos de computación en la nube

# Heatmap of cloud computing product answers (Q27 parts 1-20) per current
# role (Q6), dropping rows with a missing product or role. Tile colour is
# the share within a role; the printed number is the raw count.
afroCountries %>%
  select(Q6, Q27_Part_1:Q27_Part_20) %>%
  gather(2:21, key = "questions", value = "cloud") %>%
  filter(!is.na(cloud), !is.na(Q6)) %>%
  group_by(Q6, cloud) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6 after summarise(), so shares are per role.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = reorder(cloud, -Count), y = Q6, fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 2.5, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  labs(
    title = "Cloud computing products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Cloud computing products"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Herramientas de visualización más utilizadas

El amor por Matplotlib es fuerte. Le siguen ggplot2 y Seaborn, pero de lejos.

# Stacked bars: respondents per visualisation library (Q22), split by Q1
# (only "Female" and "Male" answers are kept).
afroCountries %>%
  select(Q1, Q22) %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q22)) %>%
  group_by(Q1, Q22) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Q22, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 10)) +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Most used vizualisation libraries",
    x = "", y = "Count", fill = "",
    caption = "Other tools"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.text = element_text(size = 11),
    legend.position = "top"
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Heatmap of visualisation library (Q22) against most used programming
# language (Q17). Tile colour is the share of each library among users of
# a language; the printed number is the raw count.
afroCountries %>%
  select(Q17, Q22) %>%
  filter(!is.na(Q17), !is.na(Q22)) %>%
  group_by(Q17, Q22) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q17 after summarise(), so shares are per language.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = reorder(Q22, -Count), y = Q17, fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.25, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 10)) +
  labs(
    title = "Most used vizualisation library",
    x = "", y = "Most used programming language", fill = "",
    caption = "Vizualisation libraries"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 9),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.text = element_text(size = 11),
    legend.position = "none"
  )
## `summarise()` has grouped output by 'Q17'. You can override using the `.groups`
## argument.

Base de datos relacional

# Stacked bars: respondents per relational database product, split by Q1
# (only "Female" and "Male"). Product columns are picked by position
# (196:223) and stacked into one `RDB_Products` column.
afroCountries %>%
  select(Q1, 196:223) %>%
  # Keeping only these two values also removes missing Q1 answers.
  filter(Q1 %in% c("Female", "Male")) %>%
  gather(2:29, key = "questions", value = "RDB_Products") %>%
  filter(!is.na(RDB_Products)) %>%
  group_by(Q1, RDB_Products) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(RDB_Products, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 15)) +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Relational database products (past 5 years)",
    x = "", y = "Count", fill = "",
    caption = "Relational database"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 8, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Big data/productos analíticos

# Stacked bars: respondents per big data / analytics product, split by Q1
# (only "Female" and "Male"). Product columns are picked by position
# (225:249) and stacked into one `BigData_Products` column.
afroCountries %>%
  select(Q1, 225:249) %>%
  # Keeping only these two values also removes missing Q1 answers.
  filter(Q1 %in% c("Female", "Male")) %>%
  gather(2:26, key = "questions", value = "BigData_Products") %>%
  filter(!is.na(BigData_Products)) %>%
  group_by(Q1, BigData_Products) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(BigData_Products, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Big data and analytics tools (past 5 years)",
    x = "", y = "Count", fill = "",
    caption = "Big data and analytics tools"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Heatmap of big data / analytics products per continent. Product columns
# are picked by position (225:249); tile colour and label both show the
# share (in %) of each product within a continent.
newMultipleChoice %>%
  select(Continent, 225:249) %>%
  gather(2:26, key = "questions", value = "BigData_Products") %>%
  filter(!is.na(BigData_Products)) %>%
  group_by(Continent, BigData_Products) %>%
  summarise(Count = n()) %>%
  # Still grouped by Continent after summarise(), so shares are per continent.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = Continent, y = reorder(BigData_Products, pct), fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = sprintf("%.2f%%", pct)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  labs(
    title = "Big data and analytics tools (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Educación avanzada

Porcentaje de capacitación actual en aprendizaje automático y ciencia de datos

## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción

## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción

## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción

## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción

## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción

## Warning in lapply(X = X, FUN = FUN, ...): NAs introducidos por coerción
# Build one box plot of a Q35 training-share column, split by gender.
#
# The six panels below used to be six copy-pasted pipelines that differed
# only in the column plotted and the title; they now share this helper.
# Title alignment is left at the theme default for every panel (the original
# "Other" panel alone was centred, which looked like a copy-paste slip).
#
# Arguments:
#   data   data frame containing the gender column `Q1` and the column
#          named in `y_col`.
#   y_col  name of the column to draw on the y axis, as a string.
#   title  panel title.
# Returns a ggplot object.
plot_training_share <- function(data, y_col, title) {
  data %>%
    select(Q1, all_of(y_col)) %>%
    filter(Q1 %in% c("Female", "Male")) %>%
    ggplot(aes(x = "", y = .data[[y_col]], fill = Q1)) +
    geom_boxplot() +
    scale_fill_brewer(palette = "Paired") +
    theme(plot.title = element_text(size = 13),
          legend.text = element_text(size = 9),
          legend.title = element_blank()) +
    labs(title = title, x = "", y = "%")
}

p1 <- plot_training_share(multipleChoice18, "Q35_Part_1", "Self-taught")
p2 <- plot_training_share(multipleChoice18, "Q35_Part_2", "Online courses")
p3 <- plot_training_share(multipleChoice18, "Q35_Part_3", "Work")
p4 <- plot_training_share(multipleChoice18, "Q35_Part_4", "University")
p5 <- plot_training_share(multipleChoice18, "Q35_Part_5", "Kaggle competitions")
p6 <- plot_training_share(multipleChoice18, "Q35_Part_6", "Other")

# Lay the six panels out on a 3-column grid.
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)
## Warning: Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).
## Removed 7916 rows containing non-finite values (`stat_boxplot()`).

Plataforma donde comenzaste/completaste cursos de Data Science

# Line chart: for every country (Q3) in `afroCountries`, the percentage of
# its non-missing Q36 answers that went to each online learning platform.
#
# Changes from the previous version:
#   * the Q36 columns are gathered by name instead of by position (2:14), so
#     the reshape cannot drift out of sync with the select();
#   * geom_line() uses `linewidth` (`size` for lines is deprecated since
#     ggplot2 3.4.0);
#   * the fill gradient and fill label were removed: no layer maps `fill`,
#     so they never had any effect.
afroCountries %>%
  select(Q3, Q36_Part_1:Q36_Part_13) %>%
  gather(Q36_Part_1:Q36_Part_13,
         key = "questions", value = "OnlinePlatform") %>%
  filter(!is.na(OnlinePlatform)) %>%
  group_by(Q3, OnlinePlatform) %>%
  # Keep the Q3 grouping so the percentages are computed within each country.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = OnlinePlatform, y = pct, group = Q3, color = Q3)) +
  geom_point(size = 2) +
  geom_line(linewidth = 0.5) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 11, angle = -90,
                                   hjust = 0, vjust = 0.5),
        axis.text.y = element_text(size = 12),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Platform for data science courses",
       x = "", y = "%",
       caption = "")
## `summarise()` has grouped output by 'Q3'. You can override using the `.groups`
## argument.

Coursera is the most popular platform for learning, before DataCamp and Udemy.

The latter seems to be more popular amongst the male users (in terms of percentage) than the female. That is also the case for Kaggle Learn (5th place) and edX (6th place).

# Stacked bar chart: number of times each online learning platform (Q36)
# was named by Female and Male respondents in `afroCountries`, ordered from
# most to least mentioned.
afroCountries %>%
  select(Q1, Q36_Part_1:Q36_Part_13) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  gather(2:14, key = "questions", value = "OnlinePlatform") %>%
  filter(!is.na(OnlinePlatform)) %>%
  group_by(Q1, OnlinePlatform) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(OnlinePlatform, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  labs(
    title = "Online platform used for learning",
    caption = "Online learning",
    x = "", y = "Count", fill = ""
  ) +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0),
    axis.text.y = element_text(size = 12)
  )
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Plataforma donde pasaste la mayor parte de tu tiempo

Fuentes de medios favoritas

# Stacked bar chart: number of times each favourite media source (Q38) was
# named by Female and Male respondents in `afroCountries`, ordered from most
# to least mentioned.
#
# Fixes over the previous version:
#   * all 22 selected Q38 columns are reshaped; the old `gather(2:14)` only
#     covered the first 13, so Q38_Part_14..Q38_Part_22 were never counted;
#   * the title, caption and value column name had been copied from the
#     online-platform chart and now describe media sources.
afroCountries %>%
  select(Q1, Q38_Part_1:Q38_Part_22) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  gather(Q38_Part_1:Q38_Part_22,
         key = "questions", value = "MediaSource") %>%
  filter(!is.na(MediaSource)) %>%
  group_by(Q1, MediaSource) %>%
  summarise(Count = n(), .groups = "drop_last") %>%
  ggplot(aes(x = reorder(MediaSource, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 11, angle = -90, hjust = 0),
        axis.text.y = element_text(size = 12),
        legend.position = "top",
        legend.text = element_text(size = 11)) +
  labs(title = "Favorite media sources",
       x = "", y = "Count", fill = "",
       caption = "Media sources")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

¿Qué opinan sobre…?

¿Qué opinan de los MOOC/bootcamp presenciales?

# Heat map of Q39_Part_1 answers against Q6 for `afroCountries`.
# Tile colour is the percentage of each Q6 group's answers; the printed
# label is the raw count. Columns are ordered by descending count.
afroCountries %>%
  filter(!is.na(Q39_Part_1)) %>%
  group_by(Q6, Q39_Part_1) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6: pct is the share within each Q6 value.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q39_Part_1, -Count), y = Q6, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), size = 3.5, color = "white") +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  labs(
    title = "Online learning vs. Traditional institution",
    caption = "",
    x = "", y = ""
  ) +
  theme(
    legend.position = "none",
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

# Heat map of Q39_Part_2 answers against Q6 for `afroCountries`.
# Tile colour is the percentage of each Q6 group's answers; the printed
# label is the raw count. Columns are ordered by descending percentage.
afroCountries %>%
  filter(!is.na(Q39_Part_2)) %>%
  group_by(Q6, Q39_Part_2) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q6: pct is the share within each Q6 value.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q39_Part_2, -pct), y = Q6, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), size = 3.5, color = "white") +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  labs(
    title = "In-person bootcamp vs. Traditional institution",
    caption = "",
    x = "", y = ""
  ) +
  theme(
    legend.position = "none",
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9)
  )
## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Experiencia en ciencia de datos

La mayoría de los encuestados piensa que los proyectos independientes son mucho más importantes que los logros académicos.

# Stacked bar chart of Q40 (independent projects vs. academic achievements)
# for Female and Male respondents in `afroCountries`.
# Bar height is the percentage of each gender's non-missing answers, so the
# y axis is labelled "%" (it was previously mislabelled "Count").
afroCountries %>%
  select(Q1, Q40) %>%
  # Keeping only the two genders also removes rows where Q1 is NA.
  filter(Q1 %in% c("Female", "Male"), !is.na(Q40)) %>%
  group_by(Q1, Q40) %>%
  # Keep the Q1 grouping so percentages are computed within each gender.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Q40, -pct), y = pct, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text = element_text(size = 12),
        legend.position = "top",
        legend.text = element_text(size = 11)) +
  labs(title = "Independent projects vs. Academic achievements",
       x = "", y = "%", fill = "",
       caption = "Expertise in data science")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

El 45,49 % de los encuestados africanos considera que los proyectos independientes son mucho más importantes que los logros académicos.

# Heat map by continent of Q40 (independent projects vs. academic
# achievements): each tile shows the percentage of that continent's
# non-missing Q40 answers.
newMultipleChoice %>%
  filter(!is.na(Q40)) %>%
  group_by(Continent, Q40) %>%
  summarise(Count = n()) %>%
  # Still grouped by Continent: pct adds up to 100 within each continent.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = Continent, y = reorder(Q40, pct), fill = pct)) +
  geom_tile() +
  geom_text(aes(label = sprintf("%.2f%%", pct)),
            color = "white", size = 3, hjust = 0.5, vjust = 0.25) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 30)) +
  labs(
    title = "Expertise in data science",
    subtitle = "Independent projects vs. academic achievements",
    caption = "Africa and the world",
    x = "", y = "", fill = ""
  ) +
  theme(
    legend.position = "none",
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

¡Hablemos de Dinero!

# Line chart: distribution of yearly compensation bands (Q9) for Female and
# Male respondents in `afroCountries`, as a percentage of each gender's
# non-missing answers.
#
# Changes from the previous version:
#   * the "Paired" palette is applied to `color`; it used to be attached to
#     `fill`, which no layer maps, so it was silently ignored;
#   * geom_line() uses `linewidth` (`size` for lines is deprecated since
#     ggplot2 3.4.0).
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q9)) %>%
  group_by(Q1, Q9) %>%
  # Keep the Q1 grouping so percentages are computed within each gender.
  summarise(Count = n(), .groups = "drop_last") %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = Q9, y = pct, group = Q1, color = Q1)) +
  geom_point(size = 2) +
  geom_line(linewidth = 1) +
  scale_color_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 11, angle = -90,
                                   vjust = 0.5, hjust = 0),
        axis.text.y = element_text(size = 12),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Yearly compensation",
       x = "$", y = "%",
       caption = "About us")
## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

El ingreso promedio, para los encuestados que estaban dispuestos a compartirlo, es de alrededor de 0-10 000 $ y 10 000-20 000 $ para todos los países. Los países mejor pagados de África son Kenia y Sudáfrica, con más de 300 000 $ para los hombres encuestados.

# Heat map of yearly compensation band (Q9) against country (Q3), one facet
# row per gender (Female/Male), drawn with flipped coordinates.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q3) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q1 and Q9: pct is the share of each country within a
  # gender/compensation-band combination.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q3, y = Q9, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = sprintf("%.2f%%", pct)), size = 2, color = "white") +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 35)) +
  facet_grid(Q1 ~ .) +
  coord_flip() +
  labs(
    title = "Yearly compensation",
    caption = "About us",
    x = "", y = "$", fill = ""
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90,
                               hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 11),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

# Heat map of yearly compensation band (Q9) against Q4 (labelled "degree" in
# the title), one facet row per gender (Female/Male), drawn with flipped
# coordinates.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q4), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q4) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q1 and Q9: pct is the share of each Q4 value within a
  # gender/compensation-band combination.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q4, y = Q9, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = sprintf("%.2f%%", pct)), size = 2, color = "white") +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 35)) +
  facet_grid(Q1 ~ .) +
  coord_flip() +
  labs(
    title = "Yearly compensation by degree",
    caption = "About us",
    x = "", y = "$", fill = ""
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90,
                               hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 11),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

# Heat map of Q6 (labelled "current role" in the title) against yearly
# compensation band (Q9), one facet row per gender (Female/Male).
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q6), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q6) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q1 and Q9: pct is the share of each Q6 value within a
  # gender/compensation-band combination.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q9, y = Q6, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = sprintf("%.2f%%", pct)), size = 2, color = "white") +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 30)) +
  facet_grid(Q1 ~ .) +
  labs(
    title = "Yearly compensation vs. current role",
    caption = "About us",
    x = "$", y = "", fill = ""
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90,
                               hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 7),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

# Heat map of Q8 (labelled "Years of experience") against yearly
# compensation band (Q9), one facet row per gender (Female/Male).
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q8), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q8) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q1 and Q9: pct is the share of each Q8 value within a
  # gender/compensation-band combination.
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q9, y = Q8, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = sprintf("%.2f%%", pct)), size = 2, color = "white") +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 10)) +
  facet_grid(Q1 ~ .) +
  labs(
    title = "Yearly compensation by gender and experience in current role",
    caption = "About us",
    x = "$", y = "Years of experience", fill = ""
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0),
    axis.text.y = element_text(size = 11),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

The majority of African respondents are paid less than 10’000$. North America and Oceania have the largest proportion of people that are paid more than 100’000$. In Europe, the majority is in the 0-60’000$ range.

# Heat map by continent of yearly compensation band (Q9): each tile shows
# the percentage of that continent's non-missing Q9 answers.
newMultipleChoice %>%
  filter(!is.na(Q9)) %>%
  group_by(Continent, Q9) %>%
  summarise(Count = n()) %>%
  # Still grouped by Continent: pct adds up to 100 within each continent.
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = Continent, y = Q9, fill = pct)) +
  geom_tile() +
  geom_text(aes(label = sprintf("%.2f%%", pct)),
            color = "white", size = 3, hjust = 0.5, vjust = 0.25) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 30)) +
  labs(
    title = "Yearly compensation",
    caption = "Africa and the world",
    x = "", y = "", fill = ""
  ) +
  theme(
    legend.position = "none",
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11)
  )
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

#Nota importante: Los textos utilizados en las gráficas son propios del autor original de la encuesta.