#Informe 1 - Ejercicio 1

Este informe fue realizado por John Gutiérrez, Andrea Jiménez y Laura Vesga donde se busca recopilar información y tener una visión general sobre la comunidad africana de Kaggle. Además, se quieren conocer sus experiencias con las ciencias de datos y el machine learning, comparándolo con otros continentes.

Importar las librerias necesarias

El primer paso es importar las librerias necesarias.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(grid)
library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggforce)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 15893 Columns: 228
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (215): GenderSelect, Country, Age, EmploymentStatus, StudentStatus, Lear...
## dbl   (5): JobSkillImportanceDegree, LearningCategorySelftTaught, LearningCa...
## lgl   (8): LearningPlatformUsefulnessSO, LearningPlatformUsefulnessTradeBook...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 23861 Columns: 395
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (126): Time from Start to Finish (seconds), Q1, Q1_OTHER_TEXT, Q2, Q3, Q...
## dbl  (33): Q16_OTHER_TEXT, Q17_OTHER_TEXT, Q18_OTHER_TEXT, Q19_OTHER_TEXT, Q...
## lgl (236): Q16_Part_10, Q16_Part_12, Q16_Part_13, Q16_Part_14, Q16_Part_15, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 56 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Continent
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# STRING PROCESSING
# Countries: abbreviate the long survey answers.
# The same substitutions are applied to the survey answers (Q3) and to the
# `continents` lookup table, so both columns end up with identical spellings.
shorten_country_names <- function(x) {
  # fixed() matches the text literally; without it the "..." in the Iran
  # entry would be interpreted as three regex wildcards.
  x %>%
    str_replace(fixed("Iran, Islamic Republic of..."), "Iran") %>%
    str_replace(
      fixed("I do not wish to disclose my location"),
      "Won't disclose"
    ) %>%
    str_replace(
      fixed("United Kingdom of Great Britain and Northern Ireland"),
      "UK and NI"
    ) %>%
    str_replace(fixed("United States of America"), "USA")
}

multipleChoice18$Q3 <- shorten_country_names(multipleChoice18$Q3)
continents$Country <- shorten_country_names(continents$Country)

# CONVERT CATEGORICAL DATA TO FACTOR
# Age groups: the labels are the raw answers themselves, so only the level
# order is given (factor() uses the levels as labels by default).
multipleChoice18[["Q2"]] <- factor(
  multipleChoice18[["Q2"]],
  levels = c(
    "18-21", "22-24", "25-29", "30-34", "35-39", "40-44",
    "45-49", "50-54", "55-59", "60-69", "70-79", "80+"
  )
)

# Degree: names are the raw survey answers, values are the short labels.
# Answers not listed here become NA.
multipleChoice18[["Q4"]] <- local({
  degree_labels <- c(
    "Doctoral degree" = "PhD",
    "Master’s degree" = "Master",
    "Bachelor’s degree" = "Bachelor",
    "Professional degree" = "Professional",
    "No formal education past high school" = "High school",
    "Some college/university study without earning a bachelor’s degree" = "No degree",
    "I prefer not to answer" = "Won't disclose"
  )
  factor(
    multipleChoice18[["Q4"]],
    levels = names(degree_labels),
    labels = unname(degree_labels)
  )
})
                            
# Undergraduate major
# Each raw survey answer (name) is paired with its short label (value) in a
# single named vector. Previously `levels` and `labels` were two separate
# vectors listed in different orders, so several majors received the wrong
# label (e.g. "Environmental science or geology" was shown as
# "Physics/astronomy"). The order of the short labels is unchanged.
# Answers not listed here become NA.
multipleChoice18$Q5 <- local({
  major_labels <- c(
    "Medical or life sciences (biology, chemistry, medicine, etc.)" = "Medical/life sciences",
    "Computer science (software engineering, etc.)" = "Computer science",
    "Engineering (non-computer focused)" = "Engineering",
    "Mathematics or statistics" = "Mathematics/statistics",
    "A business discipline (accounting, economics, finance, etc.)" = "A business discipline",
    "Physics or astronomy" = "Physics/astronomy",
    "Information technology, networking, or system administration" = "IT/Network/Sys. admin",
    "I never declared a major" = "No major declared",
    "Humanities (history, literature, philosophy, etc.)" = "Humanities",
    "Environmental science or geology" = "Env. science",
    "Social sciences (anthropology, psychology, sociology, etc.)" = "Social sciences",
    "Other" = "Other"
  )
  factor(
    multipleChoice18$Q5,
    levels = names(major_labels),
    labels = unname(major_labels)
  )
})


# In what industry is your current employer?
# Names are the raw survey answers, values are the labels used for display.
multipleChoice18[["Q7"]] <- local({
  industry_labels <- c(
    "Retail/Sales" = "Retail / Sales",
    "I am a student" = "Student",
    "Computers/Technology" = "Computers / Technology",
    "Accounting/Finance" = "Accounting / Finance",
    "Academics/Education" = "Academics / Education",
    "Insurance/Risk Assessment" = "Insurance / Risk Assessment",
    "Other" = "Other",
    "Energy/Mining" = "Energy / Mining",
    "Non-profit/Service" = "Non-profit / Service",
    "Marketing/CRM" = "Marketing / CRM",
    "Government/Public Service" = "Government / Public Service",
    "Manufacturing/Fabrication" = "Manufacturing / Fabrication",
    "Online Service/Internet-based Services" = "Online Service / Internet-based Services",
    "Broadcasting/Communications" = "Broadcasting / Communications",
    "Medical/Pharmaceutical" = "Medical / Pharmaceutical",
    "Online Business/Internet-based Sales" = "Online Business / Internet-based Sales",
    "Military/Security/Defense" = "Military / Security/Defense",
    "Shipping/Transportation" = "Shipping / Transportation",
    "Hospitality/Entertainment/Sports" = "Hospitality / Entertainment/Sports"
  )
  factor(
    multipleChoice18[["Q7"]],
    levels = names(industry_labels),
    labels = unname(industry_labels)
  )
})

# Experience in current role (years), ordered from least to most.
multipleChoice18[["Q8"]] <- factor(
  multipleChoice18[["Q8"]],
  levels = c(
    "0-1", "1-2", "2-3", "3-4", "4-5", "5-10",
    "10-15", "15-20", "20-25", "25-30", "30+"
  )
)


# Yearly compensation
# The pay bands keep their raw text; only the "do not wish to disclose"
# answer is shortened. It is placed first, ahead of the lowest band.
multipleChoice18[["Q9"]] <- local({
  pay_bands <- c(
    "0-10,000", "10-20,000", "20-30,000", "30-40,000",
    "40-50,000", "50-60,000", "60-70,000", "70-80,000",
    "80-90,000", "90-100,000", "100-125,000",
    "125-150,000", "150-200,000", "200-250,000",
    "250-300,000", "300-400,000", "400-500,000", "500,000+"
  )
  factor(
    multipleChoice18[["Q9"]],
    levels = c(
      "I do not wish to disclose my approximate yearly compensation",
      pay_bands
    ),
    labels = c("Won't disclose", pay_bands)
  )
})

# Time spent coding
# The labels are the raw answers with the trailing " of my time" removed.
multipleChoice18[["Q23"]] <- local({
  coding_time <- c(
    "0% of my time",
    "1% to 25% of my time",
    "25% to 49% of my time",
    "50% to 74% of my time",
    "75% to 99% of my time",
    "100% of my time"
  )
  factor(
    multipleChoice18[["Q23"]],
    levels = coding_time,
    labels = sub(" of my time", "", coding_time, fixed = TRUE)
  )
})

# Coding experience
# The two "never written code" answers are shortened; the year ranges are
# used unchanged as both level and label.
multipleChoice18[["Q24"]] <- local({
  year_bands <- c(
    "< 1 year", "1-2 years", "3-5 years", "5-10 years",
    "10-20 years", "20-30 years", "30-40 years", "40+ years"
  )
  factor(
    multipleChoice18[["Q24"]],
    levels = c(
      "I have never written code and I do not want to learn",
      "I have never written code but I want to learn",
      year_bands
    ),
    labels = c(
      "I don't write code and don't want to learn",
      "I don't write code but want to learn",
      year_bands
    )
  )
})

# For how many years have you used machine learning methods
# Same scheme as above: shortened "never studied" answers first, then the
# year ranges unchanged.
multipleChoice18[["Q25"]] <- local({
  year_bands <- c(
    "< 1 year", "1-2 years", "2-3 years", "3-4 years", "4-5 years",
    "5-10 years", "10-15 years", "20+ years"
  )
  factor(
    multipleChoice18[["Q25"]],
    levels = c(
      "I have never studied machine learning and I do not plan to",
      "I have never studied machine learning but plan to learn in the future",
      year_bands
    ),
    labels = c(
      "Never studied, do not plan to",
      "Never studied, plan to learn",
      year_bands
    )
  )
})

# Use of machine learning in industries
# Names are the raw survey answers, values are the short display labels.
multipleChoice18[["Q10"]] <- local({
  ml_usage_labels <- c(
    "I do not know" = "I do not know",
    "No (we do not use ML methods)" = "No",
    "We are exploring ML methods (and may one day put a model into production)" = "Exploring ML methods",
    "We recently started using ML methods (i.e., models in production for less than 2 years)" = "Recently started",
    "We have well established ML methods (i.e., models in production for more than 2 years)" = "Well established ML methods",
    "We use ML methods for generating insights (but do not put working models into production)" = "For generating insights"
  )
  factor(
    multipleChoice18[["Q10"]],
    levels = names(ml_usage_labels),
    labels = unname(ml_usage_labels)
  )
})


# Expertise in data science: independent projects vs. academic achievements
# NOTE(review): the "slightly less important" answer is labelled just
# "Less important" -- confirm whether "Slightly less important" was intended.
multipleChoice18[["Q40"]] <- local({
  project_importance_labels <- c(
    "Independent projects are equally important as academic achievements" = "Equally important",
    "Independent projects are much more important than academic achievements" = "Much more important",
    "Independent projects are slightly more important than academic achievements" = "Slightly more important",
    "Independent projects are slightly less important than academic achievements" = "Less important",
    "Independent projects are much less important than academic achievements" = "Much less important",
    "No opinion; I do not know" = "No opinion/Don't know"
  )
  factor(
    multipleChoice18[["Q40"]],
    levels = names(project_importance_labels),
    labels = unname(project_importance_labels)
  )
})


# Are you a data scientist?
# Labels equal the raw answers; only the order (most to least certain) is set.
multipleChoice18[["Q26"]] <- factor(
  multipleChoice18[["Q26"]],
  levels = c(
    "Definitely yes", "Probably yes", "Maybe",
    "Probably not", "Definitely not"
  )
)

Nacionalidad de encuestados

Tenemos 57 países representados en esta encuesta, de los cuales 6 son africanos. La gráfica busca comparar el número de participantes de África con los de los demás continentes. En esta, África se encuentra en la posición 5 con un total de 681 encuestados; también podemos ver que Oceanía es el continente con el menor número de encuestados y que Asia es el que cuenta con la mayor cantidad de estos.

# Bar chart of the number of respondents per continent, largest first, with
# the "Africa" bar filled in a different colour from the others.
newMultipleChoice %>%
  group_by(Continent) %>%
  summarise(Count = n()) %>%
  # The comparison already yields TRUE/FALSE (NA stays NA), so no ifelse() is
  # needed; this also avoids the T/F shorthands, which are ordinary variables
  # that can be reassigned.
  mutate(highlight_flag = Continent == "Africa") %>%
  ggplot(aes(x = reorder(Continent, -Count), y = Count, fill = Continent)) +
  # The layer-level fill overrides the plot-level one for the bars.
  geom_bar(aes(fill = highlight_flag), stat = "identity", color = "grey") +
  # Print each count just above its bar.
  geom_text(aes(label = as.character(Count)),
            position = position_dodge(width = 1),
            hjust = 0.5, vjust = -0.25, size = 3) +
  scale_fill_brewer(palette = "PuBu") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        legend.position = "none",
        legend.text = element_text(size = 11)) +
  labs(title = "Number of respondents",
       x = "", y = "Count", fill = "",
       caption = "Africa and the world")

Las gráficas que encontramos en la parte de abajo a la izquierda nos muestran una comparación entre el número de encuestados en distintos países de África (entre los años 2017 y 2018). En el informe original, el número de estos incrementó en un 109.54%; el país que más avanzó en este aspecto fue Nigeria, pues muestra el mayor incremento en su número de participantes; observamos también que Marruecos y Túnez acaban de entrar en la encuesta, ya que no se observa presencia de datos antiguos. La gráfica que se encuentra a la derecha muestra el género de los encuestados, donde vemos que la mayoría de ellos son hombres.

# Line chart of the number of respondents in `df` per country and survey
# year: one line (plus points) per country.
p1 <- df %>%
  group_by(Country, Year) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count, group = Country)) +
  # `linewidth` replaces the `size` aesthetic for lines, which was deprecated
  # in ggplot2 3.4.0.
  geom_line(aes(color = Country), linewidth = 0.5) +
  geom_point(aes(color = Country), size = 4) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text = element_text(size = 12),
        legend.position = "bottom",
        legend.title = element_blank(),
        legend.text = element_text(size = 10)) +
  labs(title = "Number of respondents",
       x = "", y = "Count", fill = "", caption = "")

## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

# Stacked bar chart of respondents per country (Q3), split by gender (Q1).
# Only the "Female" and "Male" answers are kept; countries are ordered by
# decreasing count.
p2 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  group_by(Q1, Q3) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Q3, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 10)
  ) +
  labs(
    title = "Country of residence",
    x = "", y = "", fill = "",
    caption = "About us"
  )

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Draw the two plots side by side.
grid.arrange(grobs = list(p1, p2), ncol = 2)

Distribución de género de los encuestados

Como se muestra en la gráfica anterior (derecha), el número de encuestados de género femenino es menor al masculino. En la gráfica que viene observamos la proporción de mujeres-hombres para los países encuestados, donde vemos que Túnez tiene la mayor proporción de mujeres-hombres y los otros 5 países africanos entran al menos en el top 15. Por esta razón, el resultado de la gráfica anterior es inesperado.

# Female-to-male respondent ratio per country (Q3), highest ratio first.
# Six countries (Egypt, Kenya, Morocco, Nigeria, Tunisia, South Africa) are
# filled in a different colour from the rest.
multipleChoice18 %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q3)) %>%
  group_by(Q1, Q3) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the result grouped by Q1; drop that before reshaping.
  ungroup() %>%
  # pivot_wider() supersedes spread(): one row per country with a "Female"
  # and a "Male" count column (NA when a country has no answers for one).
  pivot_wider(names_from = Q1, values_from = Count) %>%
  mutate(
    ratio = Female / Male,
    # %in% yields TRUE/FALSE directly; no ifelse() or reassignable T/F.
    highlight_flag = Q3 %in% c("Egypt", "Kenya", "Morocco",
                               "Nigeria", "Tunisia", "South Africa")
  ) %>%
  ggplot(aes(x = reorder(Q3, -ratio), y = ratio, fill = ratio)) +
  # The layer-level fill overrides the plot-level one.
  geom_bar(aes(fill = highlight_flag), stat = "identity") +
  scale_fill_brewer(palette = "Paired") +
  theme(legend.position = "none",
        plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 11),
        axis.text.x = element_text(size = 9.5, angle = -90,
                                   hjust = 0, vjust = 0.5)) +
  labs(title = "Female to Male ratio",
       x = "", y = "Ratio", fill = "",
       caption = "About us")

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

## Warning: Removed 13 rows containing missing values (`position_stack()`).

Al comparar esta proporción en África con el resto de los continentes, podemos afirmar que el continente africano ocupa el segundo lugar después de Norte América.

# Female-to-male respondent ratio per continent, highest ratio first, with
# the "Africa" bar filled in a different colour from the others.
newMultipleChoice %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  group_by(Continent, Q1) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the result grouped by Continent; drop that before
  # reshaping.
  ungroup() %>%
  # pivot_wider() supersedes spread(): one row per continent with a "Female"
  # and a "Male" count column.
  pivot_wider(names_from = Q1, values_from = Count) %>%
  mutate(
    ratio = Female / Male,
    # The comparison already yields TRUE/FALSE (NA stays NA); no ifelse() or
    # reassignable T/F shorthands.
    highlight_flag = Continent == "Africa"
  ) %>%
  ggplot(aes(x = reorder(Continent, -ratio), y = ratio, fill = ratio)) +
  # The layer-level fill overrides the plot-level one.
  geom_bar(aes(fill = highlight_flag), stat = "identity", color = "grey") +
  scale_fill_brewer(palette = "PuBu") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(legend.position = "none",
        plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 12),
        axis.text.x = element_text(size = 12)) +
  labs(title = "Female to Male ratio",
       x = "", y = "Ratio", fill = "",
       caption = "Africa and the world")

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Distribución de edad de los encuestados

Según el informe original, la mayoría de encuestados tienen entre 22 y 29 años, y no se cuenta con la presencia de participantes de más de 69 años. En la gráfica que veremos a continuación, observamos que el rango de edad de la mayoría de los encuestados de género femenino está entre 22 y 24 años, mientras que en las mujeres, el rango de edad más encuestado fue entre 25 y 29 años. Esto parece ser lo más usual en la mayoría de países. Por último, se puede observar que los hombres alcanzan un mayor rango de edades, a diferencia de las mujeres, donde tenemos grupos de edad en los que no se tiene registro de ellas.

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Pyramid-style bar chart built from `temp`: "Male" counts are drawn as
# positive values and "Female" counts are negated so they extend the other
# way. coord_flip() puts the age groups (Q2) on the vertical axis.
ggplot(temp, aes(x = Q2, fill = Q1)) +
  geom_col(data = filter(temp, Q1 == "Male"), aes(y = Count)) +
  geom_col(data = filter(temp, Q1 == "Female"), aes(y = -Count)) +
  # Tick labels are the absolute values of the breaks, so both sides of the
  # axis read as positive counts.
  scale_y_continuous(
    breaks = seq(-50, 150, 50),
    labels = as.character(abs(seq(-50, 150, 50)))
  ) +
  scale_fill_brewer(palette = "Paired") +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Age distribution in Africa",
    x = "Age group (years)", y = "Count", fill = "",
    caption = ""
  )

Vemos que cerca del 75% de los encuestados de África se encuentran por debajo de los 29 años.

Nivel educativo

Títulos obtenidos

En la gráfica, África es el continente con la mayor cantidad de títulos obtenidos de licenciaturas. En otros continentes, los títulos más comunes son de maestría, sobre todo en Europa la cual lidera la lista de doctorados.

# Line chart of the share (%) of each degree (Q4) within every continent:
# one line (plus points) per continent.
p1 <- newMultipleChoice %>%
  group_by(Continent, Q4) %>%
  filter(!is.na(Q4)) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the result grouped by Continent, so prop.table()
  # gives each degree's share within its continent.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q4, y = pct, group = Continent)) +
  # `linewidth` replaces the `size` aesthetic for lines, which was deprecated
  # in ggplot2 3.4.0.
  geom_line(aes(color = Continent), linewidth = 0.5) +
  geom_point(aes(color = Continent), size = 2) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  # No y scale is set on purpose: `pct` is numeric, and the scale_y_discrete()
  # that used to be here suppressed the tick labels of the percentage axis.
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text = element_text(size = 12),
        axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Educational background",
       x = "", y = "%", fill = "",
       caption = "")

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

# One stacked column per gender (facet rows) showing the share (%) of each
# degree (Q4). Only "Female"/"Male" answers with a known degree are kept.
p2 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q4)) %>%
  group_by(Q1, Q4) %>%
  summarise(Count = n()) %>%
  # Still grouped by Q1 here, so the percentages are within each gender.
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot(aes(x = "", y = pct, fill = Q4)) +
  geom_col(width = 1) +
  scale_fill_brewer(palette = "Set3") +
  facet_grid(Q1 ~ .) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Degree",
    x = "", y = "", fill = "Degree",
    caption = "About us"
  )

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Added so that both plots are actually drawn, side by side.
grid.arrange(grobs = list(p1, p2), ncol = 2)

En el lado derecho, se hace una comparación entre los títulos obtenidos por hombres y mujeres en África. El porcentaje de mujeres que llegan a masters y doctorados es mayor que en hombres; también se presentan datos sobre hombres que dicen no tener algún título, cuyo porcentaje es mucho mayor que el de mujeres.

Títulos no oficiales

El título no oficial más común en todo el mundo es el de computer science, seguido por ingeniería y matemáticas/estadística. El título menos común pertenece a ciencias medioambientales.

# Bubble chart of undergraduate majors (Q5) per continent: point size is the
# major's share (%) within the continent.
newMultipleChoice %>%
  group_by(Continent, Q5) %>%
  filter(!is.na(Q5)) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the result grouped by Continent, so the percentages
  # are within each continent.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot() +
  geom_point(mapping = aes(x = Continent, y = reorder(Q5, pct),
                           size = pct, color = Q5)) +
  # The scale_fill_gradient() that used to follow was removed: no layer maps
  # a fill aesthetic, so it had no effect.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 11),
        axis.text.x = element_text(size = 12),
        legend.position = "none",
        legend.text = element_text(size = 11)) +
  labs(title = "Undergraduate major",
       x = "", y = "", fill = "",
       caption = "Africa and the world")

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Las mujeres lideran el campo de las ciencias computacionales en África, así como también el de las matemáticas/estadística. Según el informe original, un gran porcentaje de hombres no declaró algún título, lo cual significaría que son la población con mayor cantidad de personas sin título.

# Share (%) of each undergraduate major (Q5) within each gender (Q1), one
# line per gender. Only "Female"/"Male" answers with a known major are kept.
afroCountries %>%
  group_by(Q1, Q5) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  filter(!is.na(Q5)) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the result grouped by Q1, so the percentages are
  # within each gender.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q5, -pct), y = pct, group = Q1)) +
  geom_point(aes(color = Q1), size = 2) +
  # `linewidth` replaces the `size` aesthetic for lines, which was deprecated
  # in ggplot2 3.4.0.
  geom_line(aes(color = Q1), linewidth = 1) +
  # The scale_fill_brewer() that used to follow was removed: no layer maps a
  # fill aesthetic, so it had no effect.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 11),
        axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Undergraduate major",
       x = "", y = "%", fill = "",
       caption = "About us")

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Experiencia profesional

Rol actual e industria

En África y Asia se encuentra la mayor proporción de estudiantes, mientras que en otros continentes, como Norte América, la mayoría de encuestados trabaja directamente en el mundo de data science. En Oceanía observamos que no se tiene la misma variedad de roles, pues hay puestos (como marketing analyst) donde no hay personas desempeñándose. A manera general, los papeles más comunes son: estudiante, científico de datos e ingeniero de software.

# Bubble chart of current roles (Q6) per continent: point size is five times
# the role's share (%) within the continent.
newMultipleChoice %>%
  filter(!is.na(Q6)) %>%
  group_by(Continent, Q6) %>%
  summarise(Count = n()) %>%
  # Still grouped by Continent here, so the percentages are within each
  # continent.
  mutate(pct = round(100 * prop.table(Count), 2)) %>%
  ggplot() +
  geom_point(
    mapping = aes(
      x = Continent, y = reorder(Q6, pct),
      size = 5 * pct, color = Q6
    )
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 8)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Current role",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

# Share (%) of each current role (Q6) within each gender (Q1), one line per
# gender. Only "Female"/"Male" answers with a known role are kept.
afroCountries %>%
  group_by(Q1, Q6) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  filter(!is.na(Q6)) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the result grouped by Q1, so the percentages are
  # within each gender.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q6, -pct), y = pct, group = Q1)) +
  geom_point(aes(color = Q1), size = 2) +
  # `linewidth` replaces the `size` aesthetic for lines, which was deprecated
  # in ggplot2 3.4.0.
  geom_line(aes(color = Q1), linewidth = 1) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.y = element_text(size = 12),
        axis.text.x = element_text(size = 11, angle = -90,
                                   hjust = 0, vjust = 0.5),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Current role", x = "", y = "%", fill = "",
       caption = "About us")

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

En el gráfico anterior, se aprecia que la cantidad de hombres desempeñándose como estudiantes es mayor al de las mujeres. Vemos que la línea de las mujeres se mantiene más estable que la del otro grupo, esto indicaría que ellas no se ven tan afectadas por el desempleo.

# Heat map of current role (Q6) against country (Q3). The fill is each
# country's share (%) of the respondents holding that role.
afroCountries %>%
  group_by(Q6, Q3) %>%
  filter(!is.na(Q6)) %>%
  summarise(Count = n()) %>%
  # summarise() leaves the result grouped by Q6, so the percentages are
  # within each role.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q6, y = Q3, fill = pct)) +
  # `linewidth` replaces the `size` aesthetic for the tile outline, which was
  # deprecated in ggplot2 3.4.0.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
        axis.text.y = element_text(size = 12),
        legend.text = element_text(size = 11)) +
  labs(title = "Current role by country",
       x = "", y = "", fill = "",
       caption = "About us")

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Observamos que el porcentaje de estudiantes ha disminuido este año (para ambos géneros) y es incluso más bajo que todas las ocupaciones combinadas.

# 2017: horizontal bar chart of respondents per job title in
# `afroCountries17`, most common title at the top; missing titles are dropped.
p1 <- afroCountries17 %>%
  filter(!is.na(CurrentJobTitleSelect)) %>%
  group_by(CurrentJobTitleSelect) %>%
  summarise(Count = n()) %>%
  ggplot(
    aes(
      x = reorder(CurrentJobTitleSelect, Count),
      y = Count,
      fill = CurrentJobTitleSelect
    )
  ) +
  geom_col() +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(
    title = "Current role", subtitle = "2017",
    x = "", y = "Count", fill = "",
    caption = ""
  )

# 2018: horizontal bar chart of respondents per role (Q6) in `afroCountries`,
# most common role at the top. Missing roles and "Student" are excluded.
p2 <- afroCountries %>%
  filter(!is.na(Q6), Q6 != "Student") %>%
  group_by(Q6) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Q6, Count), y = Count, fill = Q6)) +
  geom_col() +
  coord_flip() +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(
    title = "Current role", subtitle = "2018",
    x = "", y = "Count", fill = "",
    caption = "About us"
  )

# Draw the two plot objects p1 and p2 side by side in a two-column layout.
grid.arrange(p1,p2, ncol = 2)

Experiencia en su rol actual

La mayoría de los estudiantes tienen menos de 1 año de experiencia, lo cual no es sorprendente. A excepción de 5 trabajos (Consultant, Chief officer, Project manager, Research scientist, Salesperson), la mayoría (más del 50%) de los encuestados en cada categoría de rol tienen menos de 3 años de experiencia. Los data scientists, data journalists y data analysts en su mayoría tienen menos de 1 año de experiencia. Esto es coherente con la edad de los encuestados, ya que la mayoría son jóvenes.

# Heatmap of Q8 (x axis, labelled "Years") against Q6 (y axis). Rows missing
# either answer are dropped. Each tile is shaded and labelled with the number
# of respondents in that (Q6, Q8) combination.
afroCountries %>%
  filter(!is.na(Q6), !is.na(Q8)) %>%
  group_by(Q6, Q8) %>%
  # The former `pct` column was never mapped to an aesthetic, so it is omitted.
  summarise(Count = n()) %>%
  ggplot(aes(x = Q8, y = Q6, fill = Count)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Experience in current role",
    x = "Years", y = "", fill = "",
    caption = "About us"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Uso del Machine learning en la industria

La mayoría de los estudiantes no aprenden acerca de Machine Learning en las escuelas. Incluso si la mayoría de las industrias no implementan el uso del machine learning, algunas ya están empezando a hacerlo.

# Heatmap of Q10 (x axis) against Q6 (y axis). Rows with a missing Q10 are
# dropped; rows with a missing Q6 are kept. Tiles are shaded by the share (%)
# of each Q10 answer within its Q6 group and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q10)) %>%
  group_by(Q6, Q10) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q10 level, so prop.table() is evaluated per Q6.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q10, y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  # Wrap long answer texts so the x-axis labels do not overlap.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Use of ML in industries",
    x = "", y = "", fill = "",
    caption = "Machine learning usage"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Actividades en su trabajo

La mayoría realiza análisis y entiende los datos para tomar decisiones. La mayoría también realiza cada tarea en algún punto (algunos más que otros). Los asistentes de investigación y los científicos realizan más investigaciones que los otros; los Database y Data engineers construyen y corren data infrastructures; los vendedores y los Data journalists no realizan ninguna de las tareas.

# Heatmap of the answers stored in Q11_Part_1..Q11_Part_7 (x axis) against Q6
# (y axis). The seven Q11 columns are stacked into one `Function` column,
# missing answers are dropped, and each tile is shaded by the share (%) of that
# answer within its Q6 group and labelled with the raw count.
afroCountries %>%
  select(
    Q6, Q11_Part_1, Q11_Part_2, Q11_Part_3, Q11_Part_4,
    Q11_Part_5, Q11_Part_6, Q11_Part_7
  ) %>%
  # Columns 2:8 of the selection are the seven Q11 parts.
  gather(key = "questions", value = "Function", 2:8) %>%
  filter(!is.na(Function)) %>%
  group_by(Q6, Function) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Function level, so prop.table() runs per Q6.
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = Function, y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 9),
    legend.position = "none"
  ) +
  labs(
    title = "Day to day function",
    x = "", y = "", fill = "",
    caption = "Machine learning use"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Experiencia programando

Hace cuánto programan para analizar data

Algunos llevan programando menos de 5 años, especialmente las personas de género femenino; la mayoría de estos apenas empiezan a programar y otros tienen 1 o 2 años de experiencia.

# Stacked bar chart of respondent counts per Q24 answer, filled by Q1.
# Only rows whose Q1 is "Female" or "Male" and whose Q24 is present are used.
p1 <- afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q24)) %>%
  group_by(Q1, Q24) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Q24, y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  # Wrap long answer texts before they are rotated on the x axis.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(size = 10, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Coding experience",
    x = "", y = "Count", fill = ""
  )

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Line/point chart with one series per Q3 value (the country, per the plot
# title): for each Q24 answer it shows the share (%) of that answer within the
# Q3 group. Rows missing Q3 or Q24 are dropped.
p2 <- afroCountries %>%
  filter(!is.na(Q3), !is.na(Q24)) %>%
  group_by(Q3, Q24) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q24 level, so prop.table() is evaluated per Q3.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q24, y = pct, group = Q3)) +
  geom_point(aes(color = Q3), size = 1.5) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
  geom_line(aes(color = Q3), linewidth = 0.5) +
  # NOTE(review): no fill aesthetic is mapped in this plot, so this fill scale
  # has no visible effect; kept to leave the rendered colours unchanged.
  scale_fill_brewer(palette = "Set3") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 10, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Coding experience by country",
    x = "", y = "%", fill = "",
    caption = "About us"
  )

## `summarise()` has grouped output by 'Q3'. You can override using the `.groups`
## argument.

# Draw the two plot objects p1 and p2 side by side in a two-column layout.
grid.arrange(p1,p2,ncol = 2)

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

Los Data engineers tienen el mayor porcentaje de encuestados con más experiencia en programación (con un rango entre 5 y 10 años) y la mayoría de los científicos tienen entre 3 y 5 años de experiencia.

# Heatmap of Q24 (x axis) against Q6 (y axis). Rows with a missing Q24 are
# dropped. Tiles are shaded by the share (%) of each Q24 answer within its Q6
# group and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q24)) %>%
  group_by(Q6, Q24) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q24 level, so prop.table() is evaluated per Q6.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q24, y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Coding experience", x = "", y = "",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Lenguajes de programación

Lenguaje de programación más usado

El lenguaje de programación más usado por los encuestados depende en gran parte de su posición actual. Python domina los porcentajes, pero la mayoría de estadísticos entre los encuestados usan más que todo R y Database engineers prefieren SQL. Todo esto tiene sentido, ya que todos escogen el lenguaje que más les sirve para su trabajo actual.

# Heatmap of Q17 (x axis) against Q6 (y axis). Rows with a missing Q17 are
# dropped. Tiles are shaded by the share (%) of each Q17 answer within its Q6
# group and labelled with the raw count; the x axis is ordered by -pct.
afroCountries %>%
  filter(!is.na(Q17)) %>%
  group_by(Q6, Q17) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q17 level, so prop.table() is evaluated per Q6.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q17, -pct), y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Most used programming language", x = "", y = "",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Todos recomiendan Python para una persona sin experiencia en programación que desee ser un científico de data.

# Heatmap of Q18 (x axis) against Q6 (y axis). Rows with a missing Q18 are
# dropped. Tiles are shaded by the share (%) of each Q18 answer within its Q6
# group and labelled with the raw count; the x axis is ordered by -pct.
afroCountries %>%
  filter(!is.na(Q18)) %>%
  group_by(Q6, Q18) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q18 level, so prop.table() is evaluated per Q6.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q18, -pct), y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Recommended programming language", x = "", y = "",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Incluso aquellos que regularmente utilizan otros lenguajes de programación recomiendan aprender Python primero.

# Heatmap crossing Q17 (x axis, "Most used") with Q18 (y axis, "Recommended").
# Rows missing either answer are dropped. Tiles are shaded by the share (%) of
# each Q18 answer within its Q17 group and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q17), !is.na(Q18)) %>%
  group_by(Q17, Q18) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q18 level, so prop.table() is evaluated per Q17.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q17, pct), y = Q18, fill = pct)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.x = element_text(angle = 35, hjust = 1),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Most used vs. Recommended programming languages",
    x = "Most used", y = "Recommended",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q17'. You can override using the `.groups`
## argument.

Tiempo usado activamente en programar

# Line/point chart with one series per Q1 value ("Female" / "Male" only): for
# each Q23 answer it shows the share (%) of that answer within the Q1 group.
# Rows with a missing Q23 are dropped.
afroCountries %>%
  filter(Q1 == "Female" | Q1 == "Male", !is.na(Q23)) %>%
  group_by(Q1, Q23) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q23 level, so prop.table() is evaluated per Q1.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q23, y = pct, group = Q1)) +
  geom_point(aes(color = Q1), size = 2) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
  geom_line(aes(color = Q1), linewidth = 1) +
  # NOTE(review): no fill aesthetic is mapped in this plot, so this fill scale
  # has no visible effect; kept to leave the rendered colours unchanged.
  scale_fill_brewer(palette = "Set3") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 12),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Time spent actively coding",
    x = "of time", y = "% of people", fill = "",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
## `geom_line()`: Each group consists of only one observation. ℹ Do you need to
## adjust the group aesthetic?

Los estudiantes, software engineers, data analysts y data scientists son quienes más tiempo dedican a programar: entre el 50 y el 74% de su tiempo.

# Heatmap of Q23 (x axis) against Q6 (y axis). Rows with a missing Q23 are
# dropped. Tiles are shaded by the share (%) of each Q23 answer within its Q6
# group and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q23)) %>%
  group_by(Q6, Q23) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q23 level, so prop.table() is evaluated per Q6.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q23, y = Q6, fill = pct)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Time spent coding",
    x = "of time", y = "",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

IDEs

El IDE más usado es Jupyter/Ipython.

# Heatmap of the IDE answers (x axis) against Q6 (y axis). Tiles are shaded and
# labelled with the raw count; the x axis is ordered by -pct, where pct is the
# share (%) of each answer within its Q6 group.
afroCountries %>%
  # Q6 plus the columns at positions 30 to 45 of afroCountries.
  select(Q6, 30:45) %>%
  # NOTE(review): the selection holds 16 positional columns but only the first
  # 15 (2:16) are stacked; the last one is carried along unused. Confirm that
  # excluding it is intentional.
  gather(key = "questions", value = "IDEs", 2:16) %>%
  filter(!is.na(IDEs)) %>%
  group_by(Q6, IDEs) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(IDEs, -pct), y = Q6, fill = Count)) +
  geom_tile() +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "IDEs",
    x = "", y = "", fill = "",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

## Hosted notebook

La mayoría de encuestados no usan hosted notebook.

# Heatmap of the answers stored in Q14_Part_1..Q14_Part_11 (x axis) against Q6
# (y axis). Tiles are shaded and labelled with the raw count; the x axis is
# ordered by -pct, the share (%) of each answer within its Q6 group.
afroCountries %>%
  select(Q6, Q14_Part_1:Q14_Part_11) %>%
  # Columns 2:12 of the selection are the eleven Q14 parts.
  gather(key = "questions", value = "Hosted_Notebook", 2:12) %>%
  filter(!is.na(Hosted_Notebook)) %>%
  group_by(Q6, Hosted_Notebook) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Hosted_Notebook, -pct), y = Q6, fill = Count)) +
  geom_tile() +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Hosted notebook used at school or work",
    subtitle = "(past 5 years)",
    x = "", y = "", fill = "",
    caption = ""
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Herramienta principal para analizar data

La mayoría de los encuestados prefieren usar ambientes locales/hosted para analizar data, en especial los estudiantes.

# Heatmap of Q12_MULTIPLE_CHOICE (x axis, ordered by -Count) against Q6
# (y axis). Rows with a missing Q12_MULTIPLE_CHOICE are dropped. Tiles are
# shaded and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q12_MULTIPLE_CHOICE)) %>%
  group_by(Q6, Q12_MULTIPLE_CHOICE) %>%
  summarise(Count = n()) %>%
  ggplot(aes(
    x = reorder(Q12_MULTIPLE_CHOICE, -Count),
    y = Q6,
    fill = Count
  )) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Primary tools for data analysis",
    x = "", y = "",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

¿Eres un científico de datos?

La mayoría de los encuestados africanos se refieren a ellos mismos como científicos de datos por su experiencia programando y analizando data.

# Stacked bar chart of respondent counts per Q26 answer, filled by Q1.
# Only rows whose Q1 is "Female" or "Male" and whose Q26 is present are used.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q26)) %>%
  group_by(Q1, Q26) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Q26, y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 8)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Think of themself as a data scientist",
    x = "", y = "Count", fill = "",
    caption = "Personal views"
  )

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

La comunidad africana está más segura de su identidad como data scientist que la de otros continentes (el 59.89% respondió sí).

# Line/point chart with one series per Continent: for each Q26 answer it shows
# the share (%) of that answer within the continent. Rows with a missing Q26
# are dropped.
newMultipleChoice %>%
  filter(!is.na(Q26)) %>%
  group_by(Continent, Q26) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the Q26 level, so prop.table() runs per Continent.
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = Q26, y = pct, group = Continent)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
  geom_line(aes(color = Continent), linewidth = 0.5) +
  geom_point(aes(color = Continent), size = 2) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Think of themself as a data scientist",
    x = "", y = "%", fill = "",
    caption = "Africa and the world"
  )

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Métodos de Machine learning usados en la escuela/trabajo

La comunidad africana empezó a usar métodos de machine learning solo hace poco.

# Heatmap of Q25 (x axis) against Q6 (y axis). Rows with a missing Q25 are
# dropped. Tiles are shaded and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q25)) %>%
  group_by(Q6, Q25) %>%
  # The former `pct` column was never mapped to an aesthetic, so it is omitted.
  summarise(Count = n()) %>%
  ggplot(aes(x = Q25, y = Q6, fill = Count)) +
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for line widths.
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Usage of machine learning at work/school",
    x = "", y = "",
    caption = "Coding experience"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Productos de Machine learning

# Bubble chart of ML product answers (x axis, ordered by -Count) against Q6
# (y axis). For every Q6 group only the five answers with the highest pct are
# kept (ties included by top_n); point size encodes pct.
afroCountries %>%
  # Q6 plus the columns at positions 152 to 194 of afroCountries.
  select(Q6, 152:194) %>%
  # Columns 2:44 of the selection are the 43 positional columns.
  gather(key = "questions", value = "ML_Products", 2:44) %>%
  filter(!is.na(ML_Products)) %>%
  group_by(Q6, ML_Products) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  top_n(5, pct) %>%
  ggplot() +
  geom_point(
    mapping = aes(
      x = reorder(ML_Products, -Count), y = Q6,
      size = pct, color = ML_Products
    )
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 8),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Machine learning products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

# Bubble chart of Continent (x axis) against ML product answers (y axis,
# ordered by pct). For every continent only the five answers with the highest
# Count are kept (ties included by top_n); point size encodes pct.
newMultipleChoice %>%
  # Continent plus the columns at positions 152 to 194 of newMultipleChoice.
  select(Continent, 152:194) %>%
  # Columns 2:44 of the selection are the 43 positional columns.
  gather(key = "questions", value = "ML_Products", 2:44) %>%
  filter(!is.na(ML_Products)) %>%
  group_by(Continent, ML_Products) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  top_n(5, Count) %>%
  ggplot() +
  geom_point(
    mapping = aes(
      x = Continent, y = reorder(ML_Products, pct),
      size = pct, color = ML_Products
    )
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 8),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Machine learning products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Framework de machine learning más usado

# Heatmap of the answers stored in Q19_Part_1..Q19_Part_19 (x axis, ordered by
# -Count) against Q6 (y axis). Tiles are shaded by the share (%) of each answer
# within its Q6 group and labelled with the raw count.
afroCountries %>%
  select(Q6, Q19_Part_1:Q19_Part_19) %>%
  # Stack every selected Q19 column. The previous positional range 2:19 covered
  # only 18 of the 19 selected parts, so Q19_Part_19 was silently left out of
  # the counts; selecting "everything except Q6" cannot drift out of sync.
  gather(key = "questions", value = "ML_Framework", -Q6) %>%
  filter(!is.na(ML_Framework)) %>%
  group_by(Q6, ML_Framework) %>%
  summarise(Count = n()) %>%
  # summarise() peels off the ML_Framework level, so prop.table() runs per Q6.
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(ML_Framework, -Count), y = Q6, fill = pct)) +
  geom_tile(stat = "identity") +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 10),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Machine learning framework (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

Los encuestados consideran el machine learning una “caja negra”

# Horizontal 100%-stacked bars: one bar per Q1 value ("Female" / "Male" only),
# split by the Q48 answer. Rows with a missing Q48 are dropped.
afroCountries %>%
  group_by(Q1, Q48) %>%
  # `%in%` never yields NA, so rows with a missing Q1 are excluded as well.
  filter(Q1 %in% c("Female", "Male"), !is.na(Q48)) %>%
  ggplot(aes(x = Q1, fill = Q48)) +
  # position = "fill" rescales every bar to the same total height.
  geom_bar(position = "fill") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_fill_brewer(palette = "Set3") +
  coord_flip() +
  labs(
    title = "Do you consider ML as 'black boxes'?",
    x = "", y = "", fill = "", caption = "Personal views"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    legend.position = "bottom",
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) +
  # One legend entry per row.
  guides(fill = guide_legend(ncol = 1))

Métodos usados para determinar el éxito

# Heatmap of Continent (x axis) against the answers stored in
# Q42_Part_1..Q42_Part_5 (y axis, ordered by pct). Tiles are shaded by and
# labelled with the share (%) of each answer within its continent.
newMultipleChoice %>%
  select(Continent, Q42_Part_1:Q42_Part_5) %>%
  # Columns 2:6 of the selection are the five Q42 parts.
  gather(key = "questions", value = "Metrics", 2:6) %>%
  filter(!is.na(Metrics)) %>%
  group_by(Continent, Metrics) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = Continent, y = reorder(Metrics, pct), fill = pct)) +
  geom_tile() +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  # Label format: two decimals followed by a literal percent sign.
  geom_text(
    aes(label = sprintf("%.2f%%", pct)),
    hjust = 0.5, vjust = 0.5, size = 4, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  scale_y_discrete(labels = function(x) str_wrap(x, width = 20)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Metrics used by organizations",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Sobre data

Tipo de data más usado

La mayoría utiliza datos numéricos, pero los data scientists utilizan más los datos tabulares.

# Heatmap of Q32 (x axis, ordered by -Count) against Q6 (y axis). Rows with a
# missing Q32 are dropped. Tiles are shaded by the share (%) of each Q32 answer
# within its Q6 group and labelled with the raw count.
afroCountries %>%
  select(Q6, Q32) %>%
  filter(!is.na(Q32)) %>%
  group_by(Q6, Q32) %>%
  summarise(Count = n()) %>%
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q32, -Count), y = Q6, fill = pct)) +
  geom_tile() +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 12, angle = -90, hjust = 0, vjust = 0.5),
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Most used data types",
    x = "Type of data", y = "", fill = "",
    caption = "Data"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

## Dónde encontrar las bases de datos públicas

# Stacked bar chart of answer counts per data source (x axis, ordered by
# -Count), filled by Q1. Only "Female" / "Male" rows are used and missing
# answers are dropped.
afroCountries %>%
  # Q1 plus the columns at positions 266 to 276 of afroCountries.
  select(Q1, 266:276) %>%
  filter(Q1 %in% c("Female", "Male")) %>%
  # Columns 2:12 of the selection are the eleven positional columns.
  gather(key = "questions", value = "DataSource", 2:12) %>%
  filter(!is.na(DataSource)) %>%
  group_by(Q1, DataSource) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(DataSource, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Sources used to get public datasets",
    x = "", y = "Count", fill = "",
    caption = "Data"
  )

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Otras herramientas y servicios

Servicios de cloud computing en el trabajo/escuela

# Heatmap of the answers stored in Q15_Part_1..Q15_Part_7 (x axis, ordered by
# -Count) against Q6 (y axis). Tiles are shaded by the share (%) of each answer
# within its Q6 group and labelled with the raw count.
afroCountries %>%
  select(Q6, Q15_Part_1:Q15_Part_7) %>%
  # Columns 2:8 of the selection are the seven Q15 parts.
  gather(key = "questions", value = "Cloud_services", 2:8) %>%
  filter(!is.na(Cloud_services)) %>%
  group_by(Q6, Cloud_services) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Cloud_services, -Count), y = Q6, fill = pct)) +
  geom_tile() +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 9),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Cloud computing services at work/school",
    x = "", y = "", fill = "",
    caption = ""
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

## Cloud computing products

# Heatmap of the answers stored in Q27_Part_1..Q27_Part_20 (x axis, ordered by
# -Count) against Q6 (y axis). Rows missing the answer or Q6 are dropped. Tiles
# are shaded by the share (%) of each answer within its Q6 group and labelled
# with the raw count.
afroCountries %>%
  select(Q6, Q27_Part_1:Q27_Part_20) %>%
  # Columns 2:21 of the selection are the twenty Q27 parts.
  gather(key = "questions", value = "cloud", 2:21) %>%
  filter(!is.na(cloud), !is.na(Q6)) %>%
  group_by(Q6, cloud) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(cloud, -Count), y = Q6, fill = pct)) +
  geom_tile() +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  geom_text(
    aes(label = as.character(Count)),
    hjust = 0.5, vjust = 0.5, size = 2.5, color = "white"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Cloud computing products (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Cloud computing products"
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

## Productos de bases de datos relacionales más usados

# Stacked bar chart of answer counts per relational database product (x axis,
# ordered by -Count), filled by Q1. Only "Female" / "Male" rows are used and
# missing answers are dropped.
afroCountries %>%
  # Q1 plus the columns at positions 196 to 223 of afroCountries.
  select(Q1, 196:223) %>%
  # `%in%` never yields NA, so rows with a missing Q1 are excluded as well.
  filter(Q1 %in% c("Female", "Male")) %>%
  # Columns 2:29 of the selection are the 28 positional columns.
  gather(key = "questions", value = "RDB_Products", 2:29) %>%
  filter(!is.na(RDB_Products)) %>%
  group_by(Q1, RDB_Products) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(RDB_Products, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 8, angle = -90, hjust = 0, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Relational database products (past 5 years)",
    x = "", y = "Count", fill = "",
    caption = "Relational database"
  )

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

Productos de big data y analytics

# Stacked bar chart of answer counts per big data / analytics product (x axis,
# ordered by -Count), filled by Q1. Only "Female" / "Male" rows are used and
# missing answers are dropped.
afroCountries %>%
  # Q1 plus the columns at positions 225 to 249 of afroCountries.
  select(Q1, 225:249) %>%
  # `%in%` never yields NA, so rows with a missing Q1 are excluded as well.
  filter(Q1 %in% c("Female", "Male")) %>%
  # Columns 2:26 of the selection are the 25 positional columns.
  gather(key = "questions", value = "BigData_Products", 2:26) %>%
  filter(!is.na(BigData_Products)) %>%
  group_by(Q1, BigData_Products) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(BigData_Products, -Count), y = Count, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 9, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Big data and analytics tools (past 5 years)",
    x = "", y = "Count", fill = "",
    caption = "Big data and analytics tools"
  )

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

# Heatmap of Continent (x axis) against big data / analytics product answers
# (y axis, ordered by pct). Tiles are shaded by and labelled with the share (%)
# of each answer within its continent.
newMultipleChoice %>%
  # Continent plus the columns at positions 225 to 249 of newMultipleChoice.
  select(Continent, 225:249) %>%
  # Columns 2:26 of the selection are the 25 positional columns.
  gather(key = "questions", value = "BigData_Products", 2:26) %>%
  filter(!is.na(BigData_Products)) %>%
  group_by(Continent, BigData_Products) %>%
  summarise(Count = n()) %>%
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(
    x = Continent,
    y = reorder(BigData_Products, pct),
    fill = pct
  )) +
  geom_tile() +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  # Label format: two decimals followed by a literal percent sign.
  geom_text(
    aes(label = sprintf("%.2f%%", pct)),
    hjust = 0.5, vjust = 0.5, size = 3, color = "white"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    legend.position = "none",
    legend.text = element_text(size = 11)
  ) +
  labs(
    title = "Big data and analytics tools (past 5 years)",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  )

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

# Educación

Porcentaje de educación en machine learning y ciencia de datos.

# Build one boxplot panel for a Q35 column of multipleChoice18, split by
# gender (Q1). Only respondents answering "Female" or "Male" are kept.
#
# share_col   Name (string) of the column drawn on the y axis (labelled "%").
# panel_title Title shown above the panel.
#
# Returns a ggplot object.
education_boxplot <- function(share_col, panel_title) {
  multipleChoice18 %>%
    select(Q1, all_of(share_col)) %>%
    filter(Q1 %in% c("Female", "Male")) %>%
    # .data[[ ]] lets the column be chosen by its name at call time
    ggplot(aes(x = "", y = .data[[share_col]], fill = Q1)) +
    geom_boxplot() +
    scale_fill_brewer(palette = "Paired") +
    theme(plot.title = element_text(size = 13),
          legend.text = element_text(size = 9),
          legend.title = element_blank()) +
    labs(title = panel_title, x = "", y = "%")
}

# One panel per Q35 part; all six share the same theme so the grid is uniform.
p1 <- education_boxplot("Q35_Part_1", "Self-taught")
p2 <- education_boxplot("Q35_Part_2", "Online courses")
p3 <- education_boxplot("Q35_Part_3", "Work")
p4 <- education_boxplot("Q35_Part_4", "University")
p5 <- education_boxplot("Q35_Part_5", "Kaggle competitions")
p6 <- education_boxplot("Q35_Part_6", "Other")

grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)

## Warning: Removed 1439 rows containing non-finite values (`stat_boxplot()`).
## Removed 1439 rows containing non-finite values (`stat_boxplot()`).
## Removed 1439 rows containing non-finite values (`stat_boxplot()`).
## Removed 1439 rows containing non-finite values (`stat_boxplot()`).
## Removed 1439 rows containing non-finite values (`stat_boxplot()`).
## Removed 1439 rows containing non-finite values (`stat_boxplot()`).

¿Qué piensan acerca de…?

¿Qué piensan acerca de MOOCs/in-person bootcamp?

Un gran porcentaje de encuestados están a favor del aprendizaje online, especialmente los profesionales.

# Heatmap of the Q39_Part_1 answers (online learning vs. traditional
# institution) for every Q6 value in afroCountries. Tiles are coloured by the
# share of each answer within its Q6 group and labelled with the raw count.
afroCountries %>%
  filter(!is.na(Q39_Part_1)) %>%
  group_by(Q6, Q39_Part_1) %>%
  summarise(Count = n()) %>%
  # still grouped by Q6 here, so pct is the share inside each Q6 value
  mutate(pct = round(prop.table(Count) * 100, 2)) %>%
  # answers are ordered by pct, the same criterion the Q39_Part_2 plot uses
  ggplot(aes(x = reorder(Q39_Part_1, -pct), y = Q6, fill = pct)) +
  # linewidth replaces the size argument deprecated for tiles in ggplot2 3.4.0
  geom_tile(linewidth = 0.5, show.legend = TRUE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  theme(legend.position = "none",
        plot.title = element_text(size = 15, hjust = 0.5),
        axis.text = element_text(size = 11),
        axis.text.y = element_text(size = 9),
        legend.text = element_text(size = 11)) +
  labs(title = "Online learning vs. Traditional institution",
       x = "", y = "",
       caption = "")

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

# Heatmap of the Q39_Part_2 answers (in-person bootcamp vs. traditional
# institution) for every Q6 value. Fill encodes the share of each answer
# within its Q6 group; the text on each tile is the raw count.
afroCountries %>%
  filter(!is.na(Q39_Part_2)) %>%
  group_by(Q6, Q39_Part_2) %>%
  summarise(Count = n()) %>%
  # the summary is still grouped by Q6, so the share is computed per Q6 value
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = reorder(Q39_Part_2, -pct), y = Q6, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = as.character(Count)), color = "white", size = 3.5) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 10)) +
  labs(title = "In-person bootcamp vs. Traditional institution",
       x = "", y = "",
       caption = "") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.text.y = element_text(size = 9),
    legend.text = element_text(size = 11)
  )

## `summarise()` has grouped output by 'Q6'. You can override using the `.groups`
## argument.

## Data science expertise

# Share of each Q40 answer (independent projects vs. academic achievements)
# within each gender in afroCountries.
# NOTE(review): geom_col() stacks the Female and Male bars, so a full bar adds
# up two within-gender percentages; consider position = "dodge" if the genders
# are meant to be compared side by side.
afroCountries %>%
  select(Q1, Q40) %>%
  # matching against the two labels also discards rows where Q1 is NA
  filter(Q1 %in% c("Female", "Male"), !is.na(Q40)) %>%
  group_by(Q1, Q40) %>%
  summarise(Count = n()) %>%
  # still grouped by Q1 here, so pct sums to 100 for each gender
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = reorder(Q40, -pct), y = pct, fill = Q1)) +
  geom_col() +
  scale_fill_brewer(palette = "Paired") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text = element_text(size = 12),
        legend.position = "top",
        legend.text = element_text(size = 11)) +
  # the y axis shows pct, so it is labelled as a percentage rather than a count
  labs(title = "Independent projects vs. Academic achievements",
       x = "", y = "%", fill = "",
       caption = "Expertise in data science")

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.

El 45,49 % de los africanos encuestados considera que los proyectos independientes son más importantes que los logros académicos.

# Heatmap of the Q40 answers by continent. Each tile shows, as colour and as
# text, the share of an answer among that continent's non-missing answers.
newMultipleChoice %>%
  filter(!is.na(Q40)) %>%
  group_by(Continent, Q40) %>%
  summarise(Count = n()) %>%
  # grouped by Continent at this point, so shares are per continent
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = Continent, y = reorder(Q40, pct), fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = sprintf("%.2f%%", pct)),
    color = "white", size = 3, hjust = 0.5, vjust = 0.25
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 30)) +
  labs(
    title = "Expertise in data science",
    subtitle = "Independent projects vs. academic achievements",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Acerca del dinero

# Line chart of the yearly compensation bands (Q9) by gender (Q1) in
# afroCountries. The y value is the share of each band within its gender.
# NOTE(review): if Q9 is a character column the bands appear in alphabetical
# order on the x axis rather than in salary order; confirm this is intended.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q9)) %>%
  group_by(Q1, Q9) %>%
  summarise(Count = n()) %>%
  # still grouped by Q1 here, so pct sums to 100 for each gender
  mutate(pct = prop.table(Count) * 100) %>%
  ggplot(aes(x = Q9, y = pct, group = Q1, color = Q1)) +
  geom_point(size = 2) +
  # line thickness is set with linewidth since ggplot2 3.4.0
  geom_line(linewidth = 1) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 5)) +
  # only the colour aesthetic is mapped, so the palette goes on the colour scale
  scale_color_brewer(palette = "Paired") +
  theme(plot.title = element_text(size = 15, hjust = 0.5),
        axis.text.x = element_text(size = 11, angle = -90, vjust = 0.5, hjust = 0),
        axis.text.y = element_text(size = 12),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 11)) +
  labs(title = "Yearly compensation",
       x = "$", y = "%", color = "",
       caption = "About us")

## `summarise()` has grouped output by 'Q1'. You can override using the `.groups`
## argument.
## `geom_line()`: Each group consists of only one observation. ℹ Do you need to
## adjust the group aesthetic?

# Heatmap of yearly compensation (Q9) against Q3, one facet row per gender.
# The percentage on each tile is the share of a Q3 value inside its
# gender x compensation-band group.
afroCountries %>%
  filter(Q1 %in% c("Female", "Male"), !is.na(Q9)) %>%
  group_by(Q1, Q9, Q3) %>%
  summarise(Count = n()) %>%
  # summarise() drops Q3 from the grouping, leaving Q1 and Q9
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q3, y = Q9, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = sprintf("%.2f%%", pct)), color = "white", size = 2) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 35)) +
  coord_flip() +
  facet_grid(Q1 ~ .) +
  labs(
    title = "Yearly compensation",
    x = "", y = "$", fill = "",
    caption = "About us"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  )

## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

A primera vista, los PhDs hombres están mejor pagados que su contraparte femenina.

# Heatmap of yearly compensation (Q9) against Q4 (plotted as "degree"), one
# facet row per gender. Each tile shows the share of a Q4 value inside its
# gender x compensation-band group.
afroCountries %>%
  filter(
    Q1 %in% c("Female", "Male"),
    !is.na(Q4),
    !is.na(Q9)
  ) %>%
  group_by(Q1, Q9, Q4) %>%
  summarise(Count = n()) %>%
  # grouping is now Q1 and Q9, so shares are computed within those pairs
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q4, y = Q9, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = sprintf("%.2f%%", pct)), color = "white", size = 2) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 35)) +
  coord_flip() +
  facet_grid(Q1 ~ .) +
  labs(
    title = "Yearly compensation by degree",
    x = "", y = "$", fill = "",
    caption = "About us"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  )

## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

# Heatmap of yearly compensation (Q9, x axis) against Q6 (plotted as
# "current role", y axis), one facet row per gender. Each tile shows the share
# of a Q6 value inside its gender x compensation-band group.
afroCountries %>%
  filter(
    Q1 %in% c("Female", "Male"),
    !is.na(Q6),
    !is.na(Q9)
  ) %>%
  group_by(Q1, Q9, Q6) %>%
  summarise(Count = n()) %>%
  # grouping is now Q1 and Q9, so shares are computed within those pairs
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q9, y = Q6, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = sprintf("%.2f%%", pct)), color = "white", size = 2) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 30)) +
  facet_grid(Q1 ~ .) +
  labs(
    title = "Yearly compensation vs. current role",
    x = "$", y = "", fill = "",
    caption = "About us"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0, vjust = 0.5),
    axis.text.y = element_text(size = 7),
    legend.text = element_text(size = 11)
  )

## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

Para ganar más dinero, se tendría que ser un data scientist, statistician o un data engineer.

Los años de experiencia parecen tener poca importancia con respecto a cuánto ganan las personas (como data scientist específicamente); sin embargo, parece ser que el género sí tiene gran importancia.

# Heatmap of yearly compensation (Q9, x axis) against Q8 (plotted as years of
# experience, y axis), one facet row per gender. Each tile shows the share of
# a Q8 value inside its gender x compensation-band group.
afroCountries %>%
  filter(
    Q1 %in% c("Female", "Male"),
    !is.na(Q8),
    !is.na(Q9)
  ) %>%
  group_by(Q1, Q9, Q8) %>%
  summarise(Count = n()) %>%
  # grouping is now Q1 and Q9, so shares are computed within those pairs
  mutate(pct = round(Count / sum(Count) * 100, 2)) %>%
  ggplot(aes(x = Q9, y = Q8, fill = pct)) +
  geom_tile(size = 0.5, show.legend = TRUE) +
  geom_text(aes(label = sprintf("%.2f%%", pct)), color = "white", size = 2) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 10)) +
  facet_grid(Q1 ~ .) +
  labs(
    title = "Yearly compensation by gender and experience in current role",
    x = "$", y = "Years of experience", fill = "",
    caption = "About us"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 15, hjust = 0.5),
    axis.text.x = element_text(size = 11, angle = -90, hjust = 0),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  )

## `summarise()` has grouped output by 'Q1', 'Q9'. You can override using the
## `.groups` argument.

A la mayoría de los encuestados africanos les pagan menos de 10’000$; los norteamericanos y las personas de Oceanía tienen la mayor proporción de personas a las que les pagan más de 100’000$. En Europa, la mayoría está en el rango de 0-60’000$.

# Heatmap of the yearly compensation bands (Q9) by continent. Each tile shows,
# as colour and as text, the share of a band among that continent's
# non-missing answers.
newMultipleChoice %>%
  filter(!is.na(Q9)) %>%
  group_by(Continent, Q9) %>%
  summarise(Count = n()) %>%
  # grouped by Continent at this point, so shares are per continent
  mutate(pct = Count / sum(Count) * 100) %>%
  ggplot(aes(x = Continent, y = Q9, fill = pct)) +
  geom_tile() +
  geom_text(
    aes(label = sprintf("%.2f%%", pct)),
    color = "white", size = 3, hjust = 0.5, vjust = 0.25
  ) +
  scale_fill_gradient(low = "salmon1", high = "blue") +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 5)) +
  scale_y_discrete(labels = function(lbl) str_wrap(lbl, width = 30)) +
  labs(
    title = "Yearly compensation",
    x = "", y = "", fill = "",
    caption = "Africa and the world"
  ) +
  theme(
    plot.title = element_text(size = 15, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    legend.position = "none",
    legend.text = element_text(size = 11)
  )

## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

Afrikagglers

Laura Vesga

2023-02-26