library(tidyverse)
library(DT) # datatable
library(stringr)
# Directory holding the Kaggle 2017 survey CSV files. Kept in one place so the
# script can be pointed at another location by editing a single line.
data_dir <- "/Users/apple/Desktop/YuqingRWork/Kaggle/kaggle-survey-2017"

# Read the three survey files; file.path() joins with "/" so the resulting
# paths are the same strings as before.
freeformResponses <- read_csv(file.path(data_dir, "freeformResponses.csv"))
multipleChoiceResponses <- read_csv(
  file.path(data_dir, "multipleChoiceResponses.csv")
)
schema <- read_csv(file.path(data_dir, "schema.csv"))
# colnames(multipleChoiceResponses)
# Number of rows read from the multiple-choice file.
nrow(multipleChoiceResponses)
## [1] 16716
# table(multipleChoiceResponses$Country)
# Relabel two Country values: "Republic of China" becomes "Taiwan" and
# "People 's Republic of China" becomes "China". Every other value, including
# NA, is passed through unchanged.
multipleChoiceResponses <- multipleChoiceResponses %>%
  mutate(
    Country = case_when(
      Country == "Republic of China" ~ "Taiwan",
      Country == "People 's Republic of China" ~ "China",
      TRUE ~ Country
    )
  )
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
# Hex colour constants; coolLightGreen is the bar fill used by the charts below.
coolYellow <- "#F1C40F"
coolLightGreen <- "#7aaa0a"
# Median respondent age per country, sorted ascending; print the first 15 rows.
multipleChoiceResponses %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(AgeMedian = median(Age, na.rm = TRUE)) %>%
  # Convert Country to a factor whose levels are ordered by AgeMedian.
  mutate(Country = reorder(Country, AgeMedian)) %>%
  # arrange() sorts ascending by default.
  arrange(AgeMedian) %>%
  head(n = 15)
## # A tibble: 15 x 2
## Country AgeMedian
## <fct> <dbl>
## 1 Indonesia 24
## 2 Vietnam 24
## 3 China 25
## 4 India 25
## 5 Pakistan 25
## 6 Belarus 26
## 7 Kenya 26
## 8 Philippines 26.5
## 9 Malaysia 27
## 10 Taiwan 27
## 11 Ukraine 27
## 12 Iran 28
## 13 Russia 28
## 14 Turkey 28
## 15 Egypt 28.5
# Table 1: median and mean (rounded to one decimal) respondent age per country,
# sorted by median age ascending and rendered as a DT datatable with column
# filters on top and a caption underneath.
multipleChoiceResponses %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(
    AgeMedian = median(Age, na.rm = TRUE),
    # `digits` is spelled out in full; the previous `digit = 1` relied on
    # partial argument matching.
    AgeMean = round(mean(Age, na.rm = TRUE), digits = 1)
  ) %>%
  # Sort by the median only (ascending), not arrange(AgeMedian, AgeMean).
  arrange(AgeMedian) %>%
  datatable(
    filter = "top",
    # options = list(pageLength = 15),
    caption = htmltools::tags$caption(
      style = "caption-side: bottom; text-align: center;",
      "Table 1. ",
      htmltools::em("Median and Mean Ages of Respondents by Country.")
    )
  )
# Mean and median age of the respondents whose Country is "China".
# TRUE is written out instead of T, because T is an ordinary variable that can
# be reassigned.
multipleChoiceResponses %>%
  filter(Country == "China") %>%
  summarise(
    meanAge = mean(Age, na.rm = TRUE),
    medianAge = median(Age, na.rm = TRUE)
  )
## # A tibble: 1 x 2
## meanAge medianAge
## <dbl> <int>
## 1 27.0 25
# datatable: inspired by https://www.kaggle.com/ikleiman/data-scientists-salaries-around-the-world
# DT documentation: https://rstudio.github.io/DT/
# Bar chart of median respondent age for the 10 countries with the lowest
# median age.
multipleChoiceResponses %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(AgeMedian = median(Age, na.rm = TRUE)) %>%
  # Factor levels ordered by AgeMedian, so the bars follow that order.
  mutate(Country = reorder(Country, AgeMedian)) %>%
  arrange(AgeMedian) %>%
  head(n = 10) %>%
  ggplot(mapping = aes(x = Country, y = AgeMedian)) +
  geom_bar(stat = "identity", fill = coolLightGreen)
# stat = 'identity': the heights of the bars commonly represent one of two
# things: either a count of cases in each group, or the values in a column of
# the data frame. By default, geom_bar() uses stat = "count". This makes the
# height of each bar equal to the number of cases in each group, and it is
# incompatible with mapping values to the y aesthetic. If you want the heights
# of the bars to represent values in the data, use stat = "identity" and map a
# value to the y aesthetic.
# Number of countries to show; also interpolated into the plot title.
nsample <- 10

# Horizontal bar chart of the `nsample` countries with the lowest median
# respondent age, each bar labelled with its median.
# The title now reads "Countries" (plural after "Top N") and the axis label
# says "respondents", matching the wording of the Table 1 caption.
multipleChoiceResponses %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  summarise(AgeMedian = median(Age, na.rm = TRUE)) %>%
  # Factor levels ordered by AgeMedian, so the bars follow that order.
  mutate(Country = reorder(Country, AgeMedian)) %>%
  arrange(AgeMedian) %>%
  head(nsample) %>%
  ggplot(aes(x = Country, y = AgeMedian, label = AgeMedian)) +
  geom_bar(stat = "identity", fill = coolLightGreen) +
  geom_label(hjust = 0.5, vjust = 0.5, size = 3.3) +
  labs(
    x = "",
    y = "Median age of the respondents",
    title = str_c(
      "Top ", nsample, " ",
      "Countries With the Youngest Data Scientist Talent"
    )
  ) +
  # Flip the axes so country names sit on the vertical axis.
  coord_flip() +
  # theme_bw() # dark-on-light, works better for presentations
  theme_gray()
# Number of respondents per GenderSelect value (rows with a missing
# GenderSelect are dropped), largest group first.
multipleChoiceResponses %>%
  filter(!is.na(GenderSelect)) %>%
  group_by(GenderSelect) %>%
  # n() per group, similar to table() in base R for categorical data.
  summarise(count = n()) %>%
  ungroup() %>%
  # Convert GenderSelect to a factor whose levels are ordered by count.
  mutate(GenderSelect = reorder(GenderSelect, count)) %>%
  # arrange() is ascending by default; desc() reverses it.
  arrange(desc(count))
## # A tibble: 4 x 2
## GenderSelect count
## <fct> <int>
## 1 Male 13610
## 2 Female 2778
## 3 A different identity 159
## 4 Non-binary, genderqueer, or gender non-conforming 74