library(tidyverse)
library(DT) # datatable
library(stringr)
# Load the three survey CSV files. The directory is named once so that the
# data location only has to be changed in a single place.
data_dir <- "/Users/apple/Desktop/YuqingRWork/Kaggle/kaggle-survey-2017"

freeformResponses <- read_csv(file.path(data_dir, "freeformResponses.csv"))

multipleChoiceResponses <- read_csv(
  file.path(data_dir, "multipleChoiceResponses.csv")
)

schema <- read_csv(file.path(data_dir, "schema.csv"))
# colnames(multipleChoiceResponses)
nrow(multipleChoiceResponses)
## [1] 16716
# table(multipleChoiceResponses$Country)


Data Cleaning: China / Taiwan (overwrite the dataset)

# Replace the two formal country names with their short forms; every other
# value (including NA) is passed through unchanged.
multipleChoiceResponses <- multipleChoiceResponses %>%
  mutate(
    Country = case_when(
      Country == "Republic of China" ~ "Taiwan",
      Country == "People 's Republic of China" ~ "China",
      TRUE ~ Country
    )
  )
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.

# Colour constants (hex codes); coolLightGreen is the bar fill in the plots.
coolYellow <- "#F1C40F"
coolLightGreen <- "#7aaa0a"


A. Which country has the youngest data scientists?

1. Data Preparation
# Median respondent age per country, showing the 15 countries with the
# lowest median age (rows without a Country are dropped first).
multipleChoiceResponses %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(AgeMedian = median(Age, na.rm = TRUE)) %>%
  arrange(AgeMedian) %>% # ascending is the default
  # Convert Country to a factor whose levels are ordered by AgeMedian.
  mutate(Country = reorder(Country, AgeMedian)) %>%
  head(15)
## # A tibble: 15 x 2
##    Country     AgeMedian
##    <fct>           <dbl>
##  1 Indonesia        24  
##  2 Vietnam          24  
##  3 China            25  
##  4 India            25  
##  5 Pakistan         25  
##  6 Belarus          26  
##  7 Kenya            26  
##  8 Philippines      26.5
##  9 Malaysia         27  
## 10 Taiwan           27  
## 11 Ukraine          27  
## 12 Iran             28  
## 13 Russia           28  
## 14 Turkey           28  
## 15 Egypt            28.5
# Interactive table of the median and mean respondent age per country,
# sorted by median age in ascending order.
multipleChoiceResponses %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(
    AgeMedian = median(Age, na.rm = TRUE),
    # `digits` is spelled out in full rather than relying on partial
    # argument matching.
    AgeMean = round(mean(Age, na.rm = TRUE), digits = 1)
  ) %>%
  arrange(AgeMedian) %>% # sort by median only, not arrange(AgeMedian, AgeMean)
  datatable(
    filter = "top",
    # options = list(pageLength = 15),
    caption = htmltools::tags$caption(
      style = "caption-side: bottom; text-align: center;",
      "Table 1. ",
      htmltools::em("Median and Mean Ages of Respondents by Country.")
    )
  )
# Mean and median age of the respondents whose Country is "China".
# TRUE is written out because the alias T can be reassigned.
multipleChoiceResponses %>%
  filter(Country == "China") %>%
  summarise(
    meanAge = mean(Age, na.rm = TRUE),
    medianAge = median(Age, na.rm = TRUE)
  )
## # A tibble: 1 x 2
##   meanAge medianAge
##     <dbl>     <int>
## 1    27.0        25


datatable: Inspired by https://www.kaggle.com/ikleiman/data-scientists-salaries-around-the-world

Documentation from https://rstudio.github.io/DT/


2. Barplot Initiating
# First-pass bar chart: median age for the 10 countries with the lowest
# median respondent age.
multipleChoiceResponses %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(AgeMedian = median(Age, na.rm = TRUE)) %>%
  # Factor levels ordered by AgeMedian, so the bars follow that order.
  mutate(Country = reorder(Country, AgeMedian)) %>%
  arrange(AgeMedian) %>%
  head(10) %>%
  ggplot(aes(x = Country, y = AgeMedian)) +
  # stat = "identity": bar height is the AgeMedian value, not a row count.
  geom_bar(stat = "identity", fill = coolLightGreen)

stat = 'identity': The heights of the bars commonly represent one of two things: either a count of cases in each group, or the values in a column of the data frame. By default, geom_bar uses stat = "count" (called stat = "bin" in ggplot2 versions before 2.0.0). This makes the height of each bar equal to the number of cases in each group, and it is incompatible with mapping values to the y aesthetic. If you want the heights of the bars to represent values in the data, use stat = "identity" and map a value to the y aesthetic.

geom_bar, stat = ‘identity’


3. Adjusting and Polishing
# Number of countries to display; also inserted into the plot title.
nsample <- 10

# Polished chart: horizontal bars, each labelled with its median age.
multipleChoiceResponses %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(AgeMedian = median(Age, na.rm = TRUE)) %>%
  mutate(Country = reorder(Country, AgeMedian)) %>%
  arrange(AgeMedian) %>%
  head(nsample) %>%
  ggplot(aes(x = Country, y = AgeMedian, label = AgeMedian)) +
  geom_bar(stat = "identity", fill = coolLightGreen) +
  geom_label(hjust = 0.5, vjust = 0.5, size = 3.3) +
  labs(
    x = "",
    y = "Median age of the responders",
    title = str_c(
      "Top ", nsample, " ", "Country With the Youngest Data Scientist Talent"
    )
  ) +
  coord_flip() + # countries on the vertical axis
  # theme_bw() is the alternative considered for presentations
  theme_gray()


B. Gender Distribution

1. Data Preparation
# Number of respondents in each GenderSelect category, largest group first
# (rows with a missing GenderSelect are excluded).
multipleChoiceResponses %>%
  filter(!is.na(GenderSelect)) %>%
  group_by(GenderSelect) %>%
  summarise(count = n()) %>% # per-category tally, like table() in base R
  ungroup() %>%
  # Convert GenderSelect to a factor whose levels are ordered by count.
  mutate(GenderSelect = reorder(GenderSelect, count)) %>%
  arrange(desc(count))
## # A tibble: 4 x 2
##   GenderSelect                                      count
##   <fct>                                             <int>
## 1 Male                                              13610
## 2 Female                                             2778
## 3 A different identity                                159
## 4 Non-binary, genderqueer, or gender non-conforming    74