library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0

## -- Conflicts -------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(knitr)
library(kableExtra)
library(rlang)

## 
## Attaching package: 'rlang'

## The following objects are masked from 'package:purrr':
## 
##     %@%, %||%, as_function, flatten, flatten_chr, flatten_dbl,
##     flatten_int, flatten_lgl, invoke, list_along, modify, prepend,
##     rep_along, splice

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(stringr)

Functions

I wanted to avoid using these two functions directly, but they made life really easy, specifically chooseMultiple.
I can’t contact the author on Kaggle because I am not a contributor; if someone is, can they send her a message?
Kaggle contact info

## Takes a column and creates summary without nulls containing N sum and proportions
chooseOne = function(question){
    exp_df %>%
        filter(!UQ(sym(question)) == "") %>% 
        dplyr::group_by_(question) %>% 
    # Count how many respondents selected each option
        dplyr::summarise(count = n()) %>% 
    # Calculate what percent of respondents selected each option
        dplyr::mutate(percent = (count / sum(count)) * 100) %>% 
    # Arrange the counts in descending order
        dplyr::arrange(desc(count)) 
}

## Same as Choose one except works for answers where user could choose multiple answers per question
chooseMultiple = function(question,df){

  df %>% 
    # Remove any rows where the respondent didn't answer the question
    dplyr::filter(!UQ(sym(question)) == "") %>%
    # Remove all columns except question
    dplyr::select(question) %>% 
    # Add a column with the initial number of respondents to question
    dplyr::mutate(totalCount = n()) %>% 
    # Split multiple answers apart at the comma, but ignore commas inside parentheses
    dplyr::mutate(selections = strsplit(as.character(UQ(sym(question))), 
                                 '\\([^)]+,(*SKIP)(*FAIL)|,\\s*', perl = TRUE)) %>%
    # Split answers are now nested, need to unnest them
    unnest(selections) %>% 
    # Group by the selected responses to the question
    dplyr::group_by(selections) %>% 
   # Count how many respondents selected each option
    dplyr::summarise(totalCount = max(totalCount),
              count = n()) %>% 
    # Calculate what percent of respondents selected each option
    dplyr::mutate(percent = (count / totalCount) * 100) %>% 
    # Arrange the counts in descending order
    dplyr::arrange(desc(count))
}        


## Slight modification to choosone,
Academic_exploration=function(question,df){
     df %>%
        filter(!UQ(sym(question)) == "") %>% 
        dplyr::group_by_(question) %>% 
    # Count how many respondents selected each option
        dplyr::summarise(count = n()) %>% 
    # Calculate what percent of respondents selected each option
        dplyr::mutate(percent = (count / sum(count)) * 100) %>% 
    # Arrange the counts in descending order
        dplyr::arrange(desc(count)) 
}



## Takes a vector and creates a percantage column
proportion_function <- function(vec){
    vec/sum(vec)*100
}

##Creates intervals for column data, that matches intervals to foreign data intervals 

create_breaks <- function(dfcolumn,breaks,labels)
    {
    dfcolumn <- as.numeric(dfcolumn)
    dfcolumn <- cut(dfcolumn,breaks=breaks,labels=labels,right=FALSE)
    }


## Failed function
#identity_plots_45 <- function(df,x,y,fill){ggplot(df, aes(x = var(x),y=var(y), fill = var(fill))) + 
 #     geom_bar(stat="identity")+
#}

Load in data and create summaries for all the columns with functional

## Warning: Missing column names filled in: 'X229' [229], 'X230' [230]

## Parsed with column specification:
## cols(
##   .default = col_character()
## )

## See spec(...) for full column specifications.

## # A tibble: 2,895 x 2
##    TimeSpentStudying ProveKnowledgeSelect
##    <chr>             <chr>               
##  1 <NA>              <NA>                
##  2 <NA>              <NA>                
##  3 <NA>              <NA>                
##  4 <NA>              <NA>                
##  5 <NA>              <NA>                
##  6 <NA>              <NA>                
##  7 <NA>              <NA>                
##  8 <NA>              <NA>                
##  9 <NA>              <NA>                
## 10 <NA>              <NA>                
## # ... with 2,885 more rows

How did you first learn machine learning

## # A tibble: 6 x 3
##   FirstTrainingSelect                         count percent
##   <chr>                                       <int>   <dbl>
## 1 University courses                            975   34.4 
## 2 Self-taught                                   777   27.4 
## 3 Online courses (coursera, udemy, edx, etc.)   756   26.7 
## 4 Work                                          250    8.82
## 5 Kaggle competitions                            44    1.55
## 6 Other                                          33    1.16

Takeaway

Only 8% of data scientists went into their job with no knowledge of Machine Learning
Over 50% of data scientists learned by themselves or through online courses
Over 34% of data scientists first learned ML through university

How important is university?

Takeaways

less than 10% of data scientists feel Advanced degrees are unimportant

Education data

The image below is taken from the Burtch Works Study on Data Scientists

Burtch Works Study on Data Scientists

Takeaways

over 70% of data scientists have at least a masters degree!!

Display by Major

Takeaways

Computer Science leads the field with over 30%
Close behind is math and stat followed by Electrical engineering
About 6.5% of data scientists come from social science backgrounds

What are the Gender and Age demographics

## # A tibble: 4 x 2
##   GenderSelect percent
##   <chr>          <dbl>
## 1 Male          84.0  
## 2 Female        14.4  
## 3 other          1.14 
## 4 Non-binary     0.485

For comparison, the Burtch Works gender breakdown

Burtch Works Study on Data Scientists

“By maintaining such strong relationships with candidates and clients, Burtch Works has the unique opportunity to examine hiring and compensation trends over time, and publishes several highly-anticipated studies each year that investigate demographic and compensation data for predictive analytics, marketing research, and data science professionals. The Burtch Works Studies provide an exceptional vantage point on compensation for these professionals across the country, and contain critical information both for individuals mapping their career strategy, and for hiring managers hoping to recruit and retain outstanding personnel to their teams.”

## # A tibble: 4 x 3
##   GenderSelect                                      count percent
##   <chr>                                             <int>   <dbl>
## 1 Male                                               2422  84.0  
## 2 Female                                              416  14.4  
## 3 A different identity                                 33   1.14 
## 4 Non-binary, genderqueer, or gender non-conforming    14   0.485

## Warning in create_breaks(exp_df$Age, c(1, 22.1, 28.1, 35.1, 41.1, 49.1, :
## NAs introduced by coercion

Takeaways

Only about 18% of data scientists are over 47 years old
There are as many 18-22 year olds as there are respondents aged 62+
23-38 encompasses nearly 60% of the field
There is an enormous gender imbalance, with males outnumbering females by nearly 6 to 1 (84.0% vs. 14.4% in the table above)

Explore how long Employees have been in the data science field

## List of 1
##  $ legend.position: chr "none"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

## # A tibble: 3 x 3
##   burtchworks_tenure percent_Burtch_works tenure       
##                <dbl>                <dbl> <fctr>       
## 1                150                 38.0 0-5          
## 2                120                 30.4 6-10         
## 3                125                 31.6 " 10 + Years"

Comparing the datasets

Our dataset seems to have less experience than the BurtchWorks
We should note that observations may be somewhat influenced by a younger grouping of Data scientists

Explore Employment Status and see how it affects remote work statistics

## # A tibble: 4 x 3
##   EmploymentStatus                                     count percent
##   <chr>                                                <int>   <dbl>
## 1 Employed full-time                                    2348  81.1  
## 2 Independent contractor, freelancer, or self-employed   332  11.5  
## 3 Employed part-time                                     187   6.46 
## 4 Retired                                                 28   0.967

Takeaways

Unsurprisingly, people working independently tend to work remotely, with over 50% of those employed this way always working remotely
Full-time employees don’t work remotely all that much. Over 35% of them rarely work remotely, and very few almost always work remotely
Part-time employees seem pretty evenly split

Employer categories

## [[1]]
## # A tibble: 7 x 3
##   FormalEducation                                             count perce~
##   <chr>                                                       <int>  <dbl>
## 1 Master's degree                                              1280 44.5  
## 2 Doctoral degree                                               765 26.6  
## 3 Bachelor's degree                                             627 21.8  
## 4 Professional degree                                           100  3.47 
## 5 Some college/university study without earning a bachelor's~    76  2.64 
## 6 I did not complete any formal education past high school       22  0.764
## 7 I prefer not to answer                                          9  0.313
## 
## [[2]]
## # A tibble: 15 x 3
##    MajorSelect                                                count perce~
##    <chr>                                                      <int>  <dbl>
##  1 Computer Science                                             836 32.2  
##  2 Mathematics or statistics                                    500 19.3  
##  3 Electrical Engineering                                       264 10.2  
##  4 Engineering (non-computer focused)                           226  8.71 
##  5 Physics                                                      181  6.98 
##  6 Other                                                        138  5.32 
##  7 Information technology, networking, or system administrat~   119  4.59 
##  8 A social science                                             104  4.01 
##  9 Biology                                                       66  2.54 
## 10 Management information systems                                44  1.70 
## 11 Psychology                                                    38  1.46 
## 12 A health science                                              37  1.43 
## 13 A humanities discipline                                       24  0.925
## 14 Fine arts or performing arts                                  11  0.424
## 15 I never declared a major                                       6  0.231
## 
## [[3]]
## # A tibble: 6 x 3
##   Tenure                             count percent
##   <chr>                              <int>   <dbl>
## 1 3 to 5 years                         895  31.6  
## 2 More than 10 years                   652  23.0  
## 3 1 to 2 years                         590  20.8  
## 4 6 to 10 years                        515  18.2  
## 5 Less than a year                     173   6.11 
## 6 I don't write code to analyze data     8   0.282
## 
## [[4]]
## # A tibble: 6 x 3
##   FirstTrainingSelect                         count percent
##   <chr>                                       <int>   <dbl>
## 1 University courses                            975   34.4 
## 2 Self-taught                                   777   27.4 
## 3 Online courses (coursera, udemy, edx, etc.)   756   26.7 
## 4 Work                                          250    8.82
## 5 Kaggle competitions                            44    1.55
## 6 Other                                          33    1.16
## 
## [[5]]
## # A tibble: 25 x 3
##    EmployerIndustry   count percent
##    <chr>              <int>   <dbl>
##  1 Academic             638   23.8 
##  2 Technology           485   18.1 
##  3 Financial            243    9.07
##  4 Other                216    8.07
##  5 Mix of fields        206    7.69
##  6 Internet-based       158    5.90
##  7 Government           155    5.79
##  8 Manufacturing         93    3.47
##  9 CRM/Marketing         84    3.14
## 10 Telecommunications    67    2.50
## # ... with 15 more rows
## 
## [[6]]
## # A tibble: 27 x 3
##    EmployerSize             count percent
##    <chr>                    <int>   <dbl>
##  1 10,000 or more employees   454   19.9 
##  2 100 to 499 employees       379   16.6 
##  3 1,000 to 4,999 employees   345   15.1 
##  4 20 to 99 employees         340   14.9 
##  5 500 to 999 employees       171    7.48
##  6 10 to 19 employees         148    6.47
##  7 5,000 to 9,999 employees   146    6.38
##  8 Fewer than 10 employees    134    5.86
##  9 I don't know                94    4.11
## 10 I prefer not to answer      24    1.05
## # ... with 17 more rows
## 
## [[7]]
## # A tibble: 22 x 3
##    EmployerSizeChange       count percent
##    <chr>                    <int>   <dbl>
##  1 Increased slightly         828  37.1  
##  2 Stayed the same            710  31.8  
##  3 Increased significantly    453  20.3  
##  4 Decreased slightly         135   6.05 
##  5 Decreased significantly     58   2.60 
##  6 1,000 to 4,999 employees     7   0.314
##  7 Fewer than 10 employees      7   0.314
##  8 500 to 999 employees         5   0.224
##  9 10,000 or more employees     4   0.179
## 10 20 to 99 employees           4   0.179
## # ... with 12 more rows

Use top employment industries (Academic Financial Government Technology)

Create function to map a grid of our subsetted data

### Someone get me a function here to create factor of 1st column!!!
### Also one for ggplot if possible?




## Explore subset but first create function 
explore_data_science <- function(field,academic_indus){
Academic_Size <- Academic_exploration("EmployerSize",Academic_indus)
Academic_Size$EmployerSize<- factor(Academic_Size$EmployerSize, levels=Academic_Size$EmployerSize)

Academic_SizeChange <- Academic_exploration("EmployerSizeChange",Academic_indus)
Academic_SizeChange$EmployerSizeChange<- factor(Academic_SizeChange$EmployerSizeChange, levels=Academic_SizeChange$EmployerSizeChange)

Academic_MLTime <- Academic_exploration("EmployerMLTime",Academic_indus)
Academic_MLTime$EmployerMLTime<- factor(Academic_MLTime$EmployerMLTime, levels=Academic_MLTime$EmployerMLTime)

Academic_SearchMethod <- Academic_exploration("EmployerSearchMethod",Academic_indus)
Academic_SearchMethod[,1]= c("friend/Family", "Internal Recruiter", "Other Way", "Thru Website", "General Job Board", "Career Fair", "Tech Job board", "Headhunter")
Academic_SearchMethod$EmployerSearchMethod<- factor(Academic_SearchMethod$EmployerSearchMethod, levels=Academic_SearchMethod$EmployerSearchMethod)

plot_1 <- ggplot(Academic_Size, aes(x = EmployerSize,y=percent, fill = EmployerSize)) + 
      geom_bar(stat="identity")+
      theme(legend.position="none")+
    coord_flip()
plot_2 <- ggplot(Academic_SizeChange, aes(x = EmployerSizeChange,y=percent, fill = EmployerSizeChange)) + 
      geom_bar(stat="identity")+
      theme(legend.position="none")+
    coord_flip()
plot_3 <- ggplot(Academic_MLTime, aes(x = EmployerMLTime,y=percent, fill = EmployerMLTime)) + 
      geom_bar(stat="identity")+
      theme(legend.position="none")+
    coord_flip()
plot_4 <- ggplot(Academic_SearchMethod, aes(x = EmployerSearchMethod,y=percent, fill = EmployerSearchMethod)) + 
      geom_bar(stat="identity")+
      theme(legend.position="none")+
      coord_flip()

   
 grid.arrange(plot_1,plot_2,plot_3,plot_4, top=paste("Data scientists employed in",field))

 
}

Now we can Explore all the Major Data science employment fields

Academic_indus <- exp_df %>%
    filter(EmployerIndustry%in%c("Academic"))
explore_data_science("Academic",Academic_indus)

Academic_indus <- exp_df %>%
    filter(EmployerIndustry%in%c("Technology"))
explore_data_science("Technology",Academic_indus)

Academic_indus <- exp_df %>%
    filter(EmployerIndustry%in%c("Financial"))
explore_data_science("Financial",Academic_indus)

Academic_indus <- exp_df %>%
    filter(EmployerIndustry%in%c("Government"))
explore_data_science("Government",Academic_indus)

Create function to look at current job trends by current job titles

current_jobs <- c("CurrentJobTitleSelect", "TitleFit", "PastJobTitlesSelect", "CurrentEmployerType")
##View categories
lapply(current_jobs,function(x)chooseOne(x))

## [[1]]
## # A tibble: 15 x 3
##    CurrentJobTitleSelect                count percent
##    <chr>                                <int>   <dbl>
##  1 Scientist/Researcher                   485  16.8  
##  2 Data Analyst                           417  14.4  
##  3 Machine Learning Engineer              345  11.9  
##  4 Software Developer/Software Engineer   313  10.8  
##  5 Other                                  247   8.54 
##  6 Researcher                             234   8.09 
##  7 Business Analyst                       182   6.30 
##  8 Computer Scientist                     159   5.50 
##  9 Statistician                           137   4.74 
## 10 Engineer                               118   4.08 
## 11 Predictive Modeler                      81   2.80 
## 12 Programmer                              61   2.11 
## 13 Data Miner                              42   1.45 
## 14 DBA/Database Engineer                   42   1.45 
## 15 Operations Research Practitioner        28   0.969
## 
## [[2]]
## # A tibble: 3 x 3
##   TitleFit  count percent
##   <chr>     <int>   <dbl>
## 1 Fine       1812    65.0
## 2 Perfectly   568    20.4
## 3 Poorly      407    14.6
## 
## [[3]]
## # A tibble: 874 x 3
##    PastJobTitlesSelect                             count percent
##    <chr>                                           <int>   <dbl>
##  1 Researcher                                        218    8.02
##  2 Software Developer/Software Engineer              118    4.34
##  3 Other                                             111    4.09
##  4 Data Analyst                                       90    3.31
##  5 Engineer                                           73    2.69
##  6 I haven't started working yet                      57    2.10
##  7 Programmer                                         55    2.02
##  8 Business Analyst                                   51    1.88
##  9 Programmer,Software Developer/Software Engineer    38    1.40
## 10 Data Scientist                                     36    1.32
## # ... with 864 more rows
## 
## [[4]]
## # A tibble: 60 x 3
##    CurrentEmployerType                                         count perc~
##    <chr>                                                       <int> <dbl>
##  1 Employed by college or university                             663 23.4 
##  2 Employed by a company that performs advanced analytics        485 17.1 
##  3 Employed by professional services/consulting firm             459 16.2 
##  4 Employed by a company that doesn't perform advanced analyt~   303 10.7 
##  5 Employed by company that makes advanced analytic software     264  9.32
##  6 Self-employed                                                 189  6.67
##  7 Employed by government                                        159  5.61
##  8 Employed by non-profit or NGO                                  56  1.98
##  9 Employed by company that makes advanced analytic software,~    46  1.62
## 10 Employed by professional services/consulting firm,Employed~    37  1.31
## # ... with 50 more rows

## Create function to explore =("TitleFit","PastJobTitlesSelect","CurrentEmployerType") for current types of job positions

explore_current_job<- function(current_job,field){
#Load in df and column choice
    #Academic_Size <- Academic_exploration(field,current_job)

#create data for ("TitleFit","PastJobTitlesSelect","CurrentEmployerType")  
#"TitleFit
    current_job_TitleFit <- Academic_exploration("TitleFit",current_job)
    current_job_TitleFit$TitleFit<- factor( current_job_TitleFit$TitleFit, levels= current_job_TitleFit$TitleFit)
#"PastJobTitlesSelect 
    current_job_PastJob <- chooseMultiple("PastJobTitlesSelect",current_job)
    current_job_PastJob <- current_job_PastJob[1:7,]
    current_job_PastJob$selections<- factor( current_job_PastJob$selections, levels= current_job_PastJob$selections)
#"CurrentEmployerType"    
    current_job_CurrentEmployer<- chooseMultiple("CurrentEmployerType",current_job)
    #current_job_CurrentEmployer <- current_job_CurrentEmployer[1:8,]
    current_job_CurrentEmployer$selections<- factor( current_job_CurrentEmployer$selections, levels= current_job_CurrentEmployer$selections)

       
#Plots    

    plot_1 <- ggplot(current_job_TitleFit, aes(x = TitleFit,y=percent, fill = TitleFit)) + 
        geom_bar(stat="identity")+
        theme(legend.position="none")+
        xlab("Job Fit")
    
    plot_2 <-ggplot(current_job_PastJob, aes(x = selections,y=percent, fill = selections)) + 
        geom_bar(stat="identity")+
        theme(legend.position="none")+
        xlab("Past Job Title")+
        coord_flip()
    
    plot_3 <-ggplot(current_job_CurrentEmployer, aes(x = selections,y=percent, fill = selections)) + 
        geom_bar(stat="identity")+
        theme(legend.position="none")+
        xlab("Current Employer")+
        coord_flip()
    grid.arrange(plot_1,plot_2,plot_3,top=paste("Data scientists with current job title",field))
}

Look at current job trends by current job titles

current_job <- exp_df %>%
    filter(CurrentJobTitleSelect%in%c("Scientist/Researcher"))
explore_current_job(current_job,"Scientist/Researcher")

current_job <- exp_df %>%
    filter(CurrentJobTitleSelect%in%c("Data Analyst"))
explore_current_job(current_job,"Data Analyst")

current_job<- exp_df %>%
    filter(CurrentJobTitleSelect%in%c("Machine Learning Engineer"))
explore_current_job(current_job,"Machine Learning Engineer")

current_job<- exp_df %>%
    filter(CurrentJobTitleSelect%in%c("Software Developer/Software Engineer"))
explore_current_job(current_job,"Software Developer/Software Engineer")

current_job <- exp_df %>%
    filter(CurrentJobTitleSelect%in%c("Business Analyst"))
explore_current_job(current_job,"Business Analyst")

current_job <- exp_df %>%
    filter(CurrentJobTitleSelect%in%c("Statistician"))
explore_current_job(current_job,"Statistician")

CompensationAmount

US_only_df <- exp_df %>% 
    filter(Country%in%c('United States'))
    
str(US_only_df)

## Classes 'tbl_df', 'tbl' and 'data.frame':    734 obs. of  24 variables:
##  $ id                   : int  2 3 4 9 10 11 12 17 19 20 ...
##  $ GenderSelect         : chr  "Male" "Male" "Male" "Male" ...
##  $ Country              : chr  "United States" "United States" "United States" "United States" ...
##  $ Age                  : chr  "56" "58" "25" "43" ...
##  $ EmploymentStatus     : chr  "Independent contractor, freelancer, or self-employed" "Independent contractor, freelancer, or self-employed" "Employed part-time" "Retired" ...
##  $ CurrentJobTitleSelect: chr  "Operations Research Practitioner" "DBA/Database Engineer" "Researcher" "Software Developer/Software Engineer" ...
##  $ TitleFit             : chr  "Poorly" "Poorly" "Fine" "Perfectly" ...
##  $ PastJobTitlesSelect  : chr  "Business Analyst,Operations Research Practitioner,Predictive Modeler,Programmer,Other" "Data Analyst,DBA/Database Engineer,Programmer,Researcher,Software Developer/Software Engineer" NA "Computer Scientist,Programmer,Researcher,Software Developer/Software Engineer" ...
##  $ CurrentEmployerType  : chr  "Self-employed" "Employed by professional services/consulting firm" "Employed by college or university" "Employed by a company that performs advanced analytics" ...
##  $ FormalEducation      : chr  "Master's degree" "Master's degree" "Bachelor's degree" "Doctoral degree" ...
##  $ MajorSelect          : chr  "Mathematics or statistics" "Mathematics or statistics" "Physics" "Computer Science" ...
##  $ Tenure               : chr  "More than 10 years" "More than 10 years" "3 to 5 years" "More than 10 years" ...
##  $ FirstTrainingSelect  : chr  "University courses" "Online courses (coursera, udemy, edx, etc.)" "University courses" "Self-taught" ...
##  $ EmployerIndustry     : chr  "Mix of fields" "Technology" "Academic" "Financial" ...
##  $ EmployerSize         : chr  NA NA "I don't know" NA ...
##  $ EmployerSizeChange   : chr  NA NA "Increased significantly" NA ...
##  $ EmployerMLTime       : chr  NA NA "Don't know" NA ...
##  $ EmployerSearchMethod : chr  NA NA "Some other way" NA ...
##  $ UniversityImportance : chr  "Very important" "Very important" "Important" NA ...
##  $ JobFunctionSelect    : chr  "Analyze and understand data to influence product or business decisions" "Analyze and understand data to influence product or business decisions" "Research that advances the state of the art of machine learning" NA ...
##  $ RemoteWork           : chr  NA "Most of the time" "Sometimes" NA ...
##  $ CompensationAmount   : chr  "250000" "120000" "20000" NA ...
##  $ CompensationCurrency : chr  "USD" NA "USD" NA ...
##  $ age_groups           : Factor w/ 7 levels "18-22","23-30",..: 6 7 2 5 2 7 3 4 3 NA ...

US_only_df$CompensationAmount <- str_replace_all(US_only_df$CompensationAmount,"\\D+","") 

us_money <- Academic_exploration("CompensationAmount",US_only_df)
us_money

## # A tibble: 121 x 3
##    CompensationAmount count percent
##    <chr>              <int>   <dbl>
##  1 100000                15    4.60
##  2 140000                15    4.60
##  3 150000                14    4.29
##  4 80000                 11    3.37
##  5 90000                 11    3.37
##  6 120000                10    3.07
##  7 50000                 10    3.07
##  8 65000                 10    3.07
##  9 125000                 9    2.76
## 10 70000                  9    2.76
## # ... with 111 more rows

my_dat <- create_breaks(us_money$CompensationAmount,breaks=c(0,30000,70000,11000,150000,Inf),labels=c('<30k','30-70k',"70-110k","110-150k","150k+"))
us_money$groups <- my_dat


ggplot(us_money, aes(x = groups,y=percent, fill = groups)) + 
        geom_bar(stat="identity")+
        theme(legend.position="none")+
        xlab("Current Employer")+
        scale_x_discrete("groups", limits=c('<30k','30-70k',"70-110k","110-150k","150k+"))

Likely useless commented-out data that I misinterpreted

##Gender
# gender_1 <- c(males=163,females=40)
# gender_2 <- c(males=152,females=51)
# proportion_function(gender_1)
# proportion_function(gender_2)


# ##AGE
# proportion_function(average_age_1) 
# proportion_function(average_age_2)
# average_age_1 <-c("0-18"=54,111,145,129,92,75,55)
# average_age_2 <-c("0-18"=58,106,137,136,92,76,56)

##Education

# education_1 <- c(45,104,260)
# education_2 <- c(46,110,240)
# education_1 <- c("no_college"=45, "college"=104, "grad school"=260)
# education_2 <- c("no_college"=46, "college"=110, "grad school"=240)
# 
##Salary
# 92.798
# salary_1 <- c(65,110,166,179)
# salary_2 <- c(62,111,170,192)


# identity_plots_45(majors,x='MajorSelect',y='percent',fill='MajorSelect')



## 4th graph (job functions): couldn't get it in
    
#Jobfunctionsselct
#    current_job_JobFunction<- Academic_exploration("JobFunctionSelect",current_job)
#    current_job_JobFunction <- current_job_JobFunction %>% 
 #       filter(JobFunctionSelect%in%c("Build prototypes to explore applying machine learning to new areas","Analyze and understand data to #influence product or business decisions","Research that advances the state of the art of machine learning",
#  "Build and/or run the data infrastructure that your business uses for storing, analyzing, and operationalizing data",
#  "Build and/or run a machine learning service that operationally improves your product or workflows",NA,"Other"))
        
 #   current_job_JobFunction <- current_job_JobFunction[1:8,]
 #   current_job_JobFunction$JobFunctionSelect<- factor( current_job_JobFunction$JobFunctionSelect, levels= #current_job_JobFunction$JobFunctionSelect)
   # plot_4 <-ggplot(current_job_JobFunction, aes(x = JobFunctionSelect,y=percent, fill = JobFunctionSelect)) + 
#        geom_bar(stat="identity")+
#        theme(legend.position="none")
       
       # theme(axis.text.x=element_text(angle=20,hjust=1))      
    




# unique(exp_df$JobFunctionSelect)
# orig_names <- c("Build prototypes to explore applying machine learning to new areas",                                
# "Analyze and understand data to influence product or business decisions",                                            
#   "Research that advances the state of the art of machine learning",                                                   
#   "Build and/or run the data infrastructure that your business uses for storing, analyzing, and operationalizing data",
#   "Build and/or run a machine learning service that operationally improves your product or workflows",                 
#   NA,                                                                                                                  
#  "Other)
# 
# new_names <- 
# 
#

```

Justin_herman

Justin Herman

March 16, 2018

Functions

Load in data and create summaries for all the columns with functional

How did you first learn machine learning

Takeaway

How important is university?

Takeaways

Education data

Below Image taken from Burtchwood Study on Data Scientists

Takeaways

Display by Major

Takeaways

What are the Gender and Age demographics

For comparison Burthwoods gender

Takeaways

Explore how long Employees have been in the data science field

Comparing the datasets

Takeaways

Employer catagories

Use top employment industries (Academic Financial Government Technology)

Create function to map a grid of our subsetted data

Now we can Explore all the Major Data science employment fields

Create function to look at current job trends by current job titles

Look at current job trends by current job titles

CompensationAmount

Likely useless commented out data that i misinterpreted