library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts -------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(kableExtra)
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, %||%, as_function, flatten, flatten_chr, flatten_dbl,
##     flatten_int, flatten_lgl, invoke, list_along, modify, prepend,
##     rep_along, splice
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(stringr)

Functions

## Summarise a single-choice survey question.
##
## Drops respondents whose answer is missing (NA) or an empty string, then
## returns one row per distinct answer with its `count` and its `percent` of
## the remaining respondents, sorted from most to least common.
##
## question: name of the column to summarise, given as a string.
## df:       data frame holding that column. Defaults to `exp_df`, which is
##           looked up when the function is called, so existing one-argument
##           calls keep working unchanged.
chooseOne <- function(question, df = exp_df) {
  df %>%
    # NA answers were already dropped implicitly (NA == "" is NA); say so
    dplyr::filter(!is.na(.data[[question]]), .data[[question]] != "") %>%
    # group_by_() is deprecated; inject the column name as a symbol instead
    dplyr::group_by(!!sym(question)) %>%
    # Count how many respondents selected each option
    dplyr::summarise(count = n()) %>%
    # Calculate what percent of respondents selected each option
    dplyr::mutate(percent = (count / sum(count)) * 100) %>%
    # Arrange the counts in descending order
    dplyr::arrange(desc(count))
}

## Summarise a multiple-choice survey question, where one respondent may have
## ticked several comma-separated answers.
##
## Returns one row per individual answer with:
##   selections - the answer text
##   totalCount - number of respondents who answered the question at all
##   count      - number of respondents who picked this answer
##   percent    - count as a percentage of totalCount (so the percentages can
##                add up to more than 100)
## sorted from most to least common.
##
## question: name of the column to summarise, given as a string.
## df:       data frame holding that column.
chooseMultiple <- function(question, df) {
  answer <- sym(question)

  df %>%
    # Remove any rows where the respondent didn't answer the question
    dplyr::filter(!is.na(.data[[question]]), .data[[question]] != "") %>%
    # Keep only the question column. The symbol is injected so that a column
    # of `df` that happens to be called "question" can never be picked instead.
    dplyr::select(!!answer) %>%
    dplyr::mutate(
      # Number of respondents to the question, taken before answers are split
      totalCount = n(),
      # Split multiple answers apart at the comma, but ignore commas inside
      # parentheses
      selections = strsplit(as.character(!!answer),
                            '\\([^)]+,(*SKIP)(*FAIL)|,\\s*', perl = TRUE)
    ) %>%
    # Split answers are now nested, need to unnest them
    unnest(selections) %>%
    # Group by the selected responses to the question
    dplyr::group_by(selections) %>%
    # Count how many respondents selected each option
    dplyr::summarise(totalCount = max(totalCount),
                     count = n()) %>%
    # Calculate what percent of respondents selected each option
    dplyr::mutate(percent = (count / totalCount) * 100) %>%
    # Arrange the counts in descending order
    dplyr::arrange(desc(count))
}


## Slight modification of chooseOne: the data frame is passed in explicitly
## instead of being read from the global `exp_df`.
##
## Drops respondents whose answer is missing (NA) or an empty string, then
## returns one row per distinct answer with its `count` and its `percent` of
## the remaining respondents, sorted from most to least common.
##
## question: name of the column to summarise, given as a string.
## df:       data frame holding that column.
Academic_exploration <- function(question, df) {
  df %>%
    # NA answers were already dropped implicitly (NA == "" is NA); say so
    dplyr::filter(!is.na(.data[[question]]), .data[[question]] != "") %>%
    # group_by_() is deprecated; inject the column name as a symbol instead
    dplyr::group_by(!!sym(question)) %>%
    # Count how many respondents selected each option
    dplyr::summarise(count = n()) %>%
    # Calculate what percent of respondents selected each option
    dplyr::mutate(percent = (count / sum(count)) * 100) %>%
    # Arrange the counts in descending order
    dplyr::arrange(desc(count))
}



## Converts a numeric vector into percentages of its own total.
## Element names, if any, are carried over to the result.
proportion_function <- function(vec) {
  total <- sum(vec)
  (vec / total) * 100
}

## Bins a column into labelled intervals, so that its intervals can be matched
## to the intervals used by an external data set.
##
## dfcolumn: values to bin; coerced to numeric, so character input such as "34"
##           is accepted (values that cannot be parsed become NA with the usual
##           coercion warning).
## breaks:   interval boundaries, passed to cut().
## labels:   one label per interval, passed to cut().
##
## Returns a factor the same length as `dfcolumn`. Intervals are closed on the
## left and open on the right (right = FALSE).
create_breaks <- function(dfcolumn, breaks, labels) {
  # as.numeric() on a factor yields the internal level codes rather than the
  # values the levels spell out, so go through character first
  if (is.factor(dfcolumn)) {
    dfcolumn <- as.character(dfcolumn)
  }
  values <- as.numeric(dfcolumn)
  # Return the factor visibly (the old body ended in an assignment, which made
  # the result invisible when the function was called at the console)
  cut(values, breaks = breaks, labels = labels, right = FALSE)
}


##Failed function
#identity_plots_45 <- function(df,x,y,fill){ggplot(df, aes(x = var(x),y=var(y), fill = var(fill))) + 
 #     geom_bar(stat="identity")+
#}

Load in data and create summaries for all the columns with functional

## Warning: Missing column names filled in: 'X229' [229], 'X230' [230]
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
## # A tibble: 2,895 x 2
##    TimeSpentStudying ProveKnowledgeSelect
##    <chr>             <chr>               
##  1 <NA>              <NA>                
##  2 <NA>              <NA>                
##  3 <NA>              <NA>                
##  4 <NA>              <NA>                
##  5 <NA>              <NA>                
##  6 <NA>              <NA>                
##  7 <NA>              <NA>                
##  8 <NA>              <NA>                
##  9 <NA>              <NA>                
## 10 <NA>              <NA>                
## # ... with 2,885 more rows

How did you first learn machine learning

## # A tibble: 6 x 3
##   FirstTrainingSelect                         count percent
##   <chr>                                       <int>   <dbl>
## 1 University courses                            975   34.4 
## 2 Self-taught                                   777   27.4 
## 3 Online courses (coursera, udemy, edx, etc.)   756   26.7 
## 4 Work                                          250    8.82
## 5 Kaggle competitions                            44    1.55
## 6 Other                                          33    1.16

Takeaway

How important is university?

Takeaways

Education data

The image below is taken from the Burtch Works Study on Data Scientists

Burtchwood Study on Data Scientists

Burtchwood Study on Data Scientists

Takeaways

  • Over 70% of data scientists have at least a master's degree!

Display by Major

Takeaways

  • Computer Science leads the field with over 30%

  • Close behind are mathematics and statistics, followed by electrical engineering

  • About 6.5% of data scientists come from social science backgrounds

What are the Gender and Age demographics

## # A tibble: 4 x 2
##   GenderSelect percent
##   <chr>          <dbl>
## 1 Male          84.0  
## 2 Female        14.4  
## 3 other          1.14 
## 4 Non-binary     0.485

For comparison, the Burtch Works gender breakdown

Burtchwood Study on Data Scientists

Burtchwood Study on Data Scientists

  • “By maintaining such strong relationships with candidates and clients, Burtch Works has the unique opportunity to examine hiring and compensation trends over time, and publishes several highly-anticipated studies each year that investigate demographic and compensation data for predictive analytics, marketing research, and data science professionals. The Burtch Works Studies provide an exceptional vantage point on compensation for these professionals across the country, and contain critical information both for individuals mapping their career strategy, and for hiring managers hoping to recruit and retain outstanding personnel to their teams.”
## # A tibble: 4 x 3
##   GenderSelect                                      count percent
##   <chr>                                             <int>   <dbl>
## 1 Male                                               2422  84.0  
## 2 Female                                              416  14.4  
## 3 A different identity                                 33   1.14 
## 4 Non-binary, genderqueer, or gender non-conforming    14   0.485
## Warning in create_breaks(exp_df$Age, c(1, 22.1, 28.1, 35.1, 41.1, 49.1, :
## NAs introduced by coercion

Takeaways

  • Only about 18% of data scientists are over 47 years old
  • There are as many 18-22-year-olds as there are people aged 62+
  • Ages 23-38 encompass nearly 60% of the field
  • There is an enormous gender imbalance, with males outnumbering females nearly 4 to 1

Explore how long Employees have been in the data science field

## List of 1
##  $ legend.position: chr "none"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
## # A tibble: 3 x 3
##   burtchworks_tenure percent_Burtch_works tenure       
##                <dbl>                <dbl> <fctr>       
## 1                150                 38.0 0-5          
## 2                120                 30.4 6-10         
## 3                125                 31.6 " 10 + Years"

Comparing the datasets

  • Respondents in our dataset seem to have less experience than those in the Burtch Works study
  • We should note that our observations may be somewhat influenced by a younger group of data scientists

Explore employment status and see how it affects remote-work statistics

## # A tibble: 4 x 3
##   EmploymentStatus                                     count percent
##   <chr>                                                <int>   <dbl>
## 1 Employed full-time                                    2348  81.1  
## 2 Independent contractor, freelancer, or self-employed   332  11.5  
## 3 Employed part-time                                     187   6.46 
## 4 Retired                                                 28   0.967

Takeaways

  • Unsurprisingly, people working independently tend to work remotely, with over 50% of those employed this way always working remotely
  • Full-time employees don’t work remotely all that much. Over 35% of them rarely work remotely, and very few almost always work remotely
  • Part-time employees seem pretty evenly split

Employer categories

## [[1]]
## # A tibble: 7 x 3
##   FormalEducation                                             count perce~
##   <chr>                                                       <int>  <dbl>
## 1 Master's degree                                              1280 44.5  
## 2 Doctoral degree                                               765 26.6  
## 3 Bachelor's degree                                             627 21.8  
## 4 Professional degree                                           100  3.47 
## 5 Some college/university study without earning a bachelor's~    76  2.64 
## 6 I did not complete any formal education past high school       22  0.764
## 7 I prefer not to answer                                          9  0.313
## 
## [[2]]
## # A tibble: 15 x 3
##    MajorSelect                                                count perce~
##    <chr>                                                      <int>  <dbl>
##  1 Computer Science                                             836 32.2  
##  2 Mathematics or statistics                                    500 19.3  
##  3 Electrical Engineering                                       264 10.2  
##  4 Engineering (non-computer focused)                           226  8.71 
##  5 Physics                                                      181  6.98 
##  6 Other                                                        138  5.32 
##  7 Information technology, networking, or system administrat~   119  4.59 
##  8 A social science                                             104  4.01 
##  9 Biology                                                       66  2.54 
## 10 Management information systems                                44  1.70 
## 11 Psychology                                                    38  1.46 
## 12 A health science                                              37  1.43 
## 13 A humanities discipline                                       24  0.925
## 14 Fine arts or performing arts                                  11  0.424
## 15 I never declared a major                                       6  0.231
## 
## [[3]]
## # A tibble: 6 x 3
##   Tenure                             count percent
##   <chr>                              <int>   <dbl>
## 1 3 to 5 years                         895  31.6  
## 2 More than 10 years                   652  23.0  
## 3 1 to 2 years                         590  20.8  
## 4 6 to 10 years                        515  18.2  
## 5 Less than a year                     173   6.11 
## 6 I don't write code to analyze data     8   0.282
## 
## [[4]]
## # A tibble: 6 x 3
##   FirstTrainingSelect                         count percent
##   <chr>                                       <int>   <dbl>
## 1 University courses                            975   34.4 
## 2 Self-taught                                   777   27.4 
## 3 Online courses (coursera, udemy, edx, etc.)   756   26.7 
## 4 Work                                          250    8.82
## 5 Kaggle competitions                            44    1.55
## 6 Other                                          33    1.16
## 
## [[5]]
## # A tibble: 25 x 3
##    EmployerIndustry   count percent
##    <chr>              <int>   <dbl>
##  1 Academic             638   23.8 
##  2 Technology           485   18.1 
##  3 Financial            243    9.07
##  4 Other                216    8.07
##  5 Mix of fields        206    7.69
##  6 Internet-based       158    5.90
##  7 Government           155    5.79
##  8 Manufacturing         93    3.47
##  9 CRM/Marketing         84    3.14
## 10 Telecommunications    67    2.50
## # ... with 15 more rows
## 
## [[6]]
## # A tibble: 27 x 3
##    EmployerSize             count percent
##    <chr>                    <int>   <dbl>
##  1 10,000 or more employees   454   19.9 
##  2 100 to 499 employees       379   16.6 
##  3 1,000 to 4,999 employees   345   15.1 
##  4 20 to 99 employees         340   14.9 
##  5 500 to 999 employees       171    7.48
##  6 10 to 19 employees         148    6.47
##  7 5,000 to 9,999 employees   146    6.38
##  8 Fewer than 10 employees    134    5.86
##  9 I don't know                94    4.11
## 10 I prefer not to answer      24    1.05
## # ... with 17 more rows
## 
## [[7]]
## # A tibble: 22 x 3
##    EmployerSizeChange       count percent
##    <chr>                    <int>   <dbl>
##  1 Increased slightly         828  37.1  
##  2 Stayed the same            710  31.8  
##  3 Increased significantly    453  20.3  
##  4 Decreased slightly         135   6.05 
##  5 Decreased significantly     58   2.60 
##  6 1,000 to 4,999 employees     7   0.314
##  7 Fewer than 10 employees      7   0.314
##  8 500 to 999 employees         5   0.224
##  9 10,000 or more employees     4   0.179
## 10 20 to 99 employees           4   0.179
## # ... with 12 more rows

Use top employment industries (Academic Financial Government Technology)

Create function to map a grid of our subsetted data

## Helper: summarise one column and fix its display order.
##
## Runs Academic_exploration() on `column` of `df`, optionally replaces the
## answer texts with shorter `labels`, and converts the answer column into a
## factor whose levels follow the current row order so that ggplot keeps that
## order instead of sorting alphabetically.
##
## labels: NULL (keep the answers as they are) or a character vector with one
##         label per summary row, applied by row position.
.summarise_for_plot <- function(column, df, labels = NULL) {
  summary_tbl <- Academic_exploration(column, df)

  if (!is.null(labels)) {
    if (nrow(summary_tbl) == length(labels)) {
      summary_tbl[[column]] <- labels
    } else {
      # A length mismatch used to abort the whole figure; keep the raw answer
      # texts instead so the remaining panels are still drawn
      warning("Expected ", length(labels), " distinct answers in '", column,
              "' but found ", nrow(summary_tbl),
              "; keeping the original answer texts.", call. = FALSE)
    }
  }

  summary_tbl[[column]] <- factor(summary_tbl[[column]],
                                  levels = summary_tbl[[column]])
  summary_tbl
}

## Helper: horizontal bar chart of `percent` per answer, one colour per answer,
## without a legend. `column` names the answer column of `summary_tbl` and is
## also used as the axis title.
.percent_bar_plot <- function(summary_tbl, column) {
  plot_data <- data.frame(answer = summary_tbl[[column]],
                          percent = summary_tbl[["percent"]])

  ggplot(plot_data, aes(x = answer, y = percent, fill = answer)) +
    geom_bar(stat = "identity") +
    theme(legend.position = "none") +
    xlab(column) +
    coord_flip()
}

## Explore one industry subset: draws four bar charts (employer size, change in
## employer size, time the employer has used ML, and how the job was found)
## arranged on one page with grid.arrange().
##
## field:          industry name, used only in the figure title.
## academic_indus: survey rows for that industry.
explore_data_science <- function(field, academic_indus) {
  # NOTE(review): these short labels are applied by row position after the
  # summary has been sorted by count, so they are only correct if the answers
  # rank in this exact order for the subset being plotted - confirm per
  # industry, or replace with an explicit answer-to-label lookup.
  search_labels <- c("friend/Family", "Internal Recruiter", "Other Way",
                     "Thru Website", "General Job Board", "Career Fair",
                     "Tech Job board", "Headhunter")

  # The argument is used here; the body previously read `Academic_indus`
  # (capital A), i.e. a global variable, and ignored what the caller passed in.
  size_summary <- .summarise_for_plot("EmployerSize", academic_indus)
  size_change_summary <- .summarise_for_plot("EmployerSizeChange", academic_indus)
  ml_time_summary <- .summarise_for_plot("EmployerMLTime", academic_indus)
  search_summary <- .summarise_for_plot("EmployerSearchMethod", academic_indus,
                                        labels = search_labels)

  plot_1 <- .percent_bar_plot(size_summary, "EmployerSize")
  plot_2 <- .percent_bar_plot(size_change_summary, "EmployerSizeChange")
  plot_3 <- .percent_bar_plot(ml_time_summary, "EmployerMLTime")
  plot_4 <- .percent_bar_plot(search_summary, "EmployerSearchMethod")

  grid.arrange(plot_1, plot_2, plot_3, plot_4,
               top = paste("Data scientists employed in", field))
}

Now we can Explore all the Major Data science employment fields

# Draw the same four-panel overview for each of the top employment industries.
# The subset is deliberately stored under the name `Academic_indus` for every
# industry (not just "Academic"): that is the name this script has always used
# for the current subset, so anything that looks it up by name keeps working.
for (industry in c("Academic", "Technology", "Financial", "Government")) {
  Academic_indus <- dplyr::filter(exp_df, EmployerIndustry %in% industry)
  explore_data_science(industry, Academic_indus)
}

Likely useless commented-out data that I misinterpreted

##Gender
# gender_1 <- c(males=163,females=40)
# gender_2 <- c(males=152,females=51)
# proportion_function(gender_1)
# proportion_function(gender_2)


# ##AGE
# proportion_function(average_age_1) 
# proportion_function(average_age_2)
# average_age_1 <-c("0-18"=54,111,145,129,92,75,55)
# average_age_2 <-c("0-18"=58,106,137,136,92,76,56)

##Education

# education_1 <- c(45,104,260)
# education_2 <- c(46,110,240)
# education_1 <- c("no_college"=45, "college"=104, "grad school"=260)
# education_2 <- c("no_college"=46, "college"=110, "grad school"=240)
# 
##Salary
# 92.798
# salary_1 <- c(65,110,166,179)
# salary_2 <- c(62,111,170,192)


# identity_plots_45(majors,x='MajorSelect',y='percent',fill='MajorSelect')



## 4th graph (job functions): couldn't get it in
    
#Jobfunctionsselct
#    current_job_JobFunction<- Academic_exploration("JobFunctionSelect",current_job)
#    current_job_JobFunction <- current_job_JobFunction %>% 
 #       filter(JobFunctionSelect%in%c("Build prototypes to explore applying machine learning to new areas","Analyze and understand data to #influence product or business decisions","Research that advances the state of the art of machine learning",
#  "Build and/or run the data infrastructure that your business uses for storing, analyzing, and operationalizing data",
#  "Build and/or run a machine learning service that operationally improves your product or workflows",NA,"Other"))
        
 #   current_job_JobFunction <- current_job_JobFunction[1:8,]
 #   current_job_JobFunction$JobFunctionSelect<- factor( current_job_JobFunction$JobFunctionSelect, levels= #current_job_JobFunction$JobFunctionSelect)
   # plot_4 <-ggplot(current_job_JobFunction, aes(x = JobFunctionSelect,y=percent, fill = JobFunctionSelect)) + 
#        geom_bar(stat="identity")+
#        theme(legend.position="none")
       
       # theme(axis.text.x=element_text(angle=20,hjust=1))      
    




# unique(exp_df$JobFunctionSelect)
# orig_names <- c("Build prototypes to explore applying machine learning to new areas",                                
# "Analyze and understand data to influence product or business decisions",                                            
#   "Research that advances the state of the art of machine learning",                                                   
#   "Build and/or run the data infrastructure that your business uses for storing, analyzing, and operationalizing data",
#   "Build and/or run a machine learning service that operationally improves your product or workflows",                 
#   NA,                                                                                                                  
#  "Other)
# 
# new_names <- 
# 
# 

```