library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts -------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
library(kableExtra)
library(rlang)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, %||%, as_function, flatten, flatten_chr, flatten_dbl,
## flatten_int, flatten_lgl, invoke, list_along, modify, prepend,
## rep_along, splice
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(stringr)
## Summarise a single-choice survey question.
##
## question: name of the column to summarise, given as a string.
## df:       data frame holding that column. Defaults to the global `exp_df`,
##           so existing one-argument calls keep working.
##
## Returns one row per distinct answer with `count` (number of rows) and
## `percent` (share of all counted rows, 0-100), sorted by descending count.
## Rows whose answer is "" are excluded; rows where the comparison evaluates
## to NA are dropped by filter() as well.
chooseOne <- function(question, df = exp_df) {
  # Turn the column name into a symbol once so it can be injected with `!!`
  # instead of the deprecated UQ() / group_by_() forms.
  answer <- sym(question)

  df %>%
    dplyr::filter((!!answer) != "") %>%
    dplyr::group_by(!!answer) %>%
    # Count how many respondents selected each option
    dplyr::summarise(count = n()) %>%
    # Calculate what percent of respondents selected each option
    dplyr::mutate(percent = (count / sum(count)) * 100) %>%
    # Arrange the counts in descending order
    dplyr::arrange(desc(count))
}
## Summarise a multiple-choice survey question, where one cell may hold several
## comma-separated answers.
##
## question: name of the column to summarise, given as a string.
## df:       data frame holding that column.
##
## Returns one row per individual answer (`selections`) with:
##   totalCount - number of rows that had a non-empty answer to the question
##   count      - number of times this answer was selected
##   percent    - count / totalCount * 100 (percentages can sum to more than
##                100 because one row can contribute several answers)
## sorted by descending count.
chooseMultiple <- function(question, df) {
  # Inject the column by symbol. Passing the character variable itself to
  # select() is ambiguous: it would pick a column literally named "question"
  # if the data ever had one.
  answer <- sym(question)

  df %>%
    # Remove any rows where the respondent didn't answer the question
    dplyr::filter((!!answer) != "") %>%
    # Remove all columns except question
    dplyr::select(!!answer) %>%
    # Add a column with the initial number of respondents to question
    dplyr::mutate(totalCount = n()) %>%
    # Split multiple answers apart at the comma, but ignore commas inside parentheses
    dplyr::mutate(selections = strsplit(as.character(!!answer),
                                        '\\([^)]+,(*SKIP)(*FAIL)|,\\s*', perl = TRUE)) %>%
    # Split answers are now nested, need to unnest them
    unnest(selections) %>%
    # Group by the selected responses to the question
    dplyr::group_by(selections) %>%
    # Count how many respondents selected each option; totalCount is constant
    # across rows, so max() simply carries it through the summary
    dplyr::summarise(totalCount = max(totalCount),
                     count = n()) %>%
    # Calculate what percent of respondents selected each option
    dplyr::mutate(percent = (count / totalCount) * 100) %>%
    # Arrange the counts in descending order
    dplyr::arrange(desc(count))
}
## Summarise a single-choice survey question for an arbitrary data frame.
##
## question: name of the column to summarise, given as a string.
## df:       data frame holding that column (typically a filtered subset of
##           the survey).
##
## Returns one row per distinct answer with `count` and `percent` (0-100),
## sorted by descending count. Rows whose answer is "" are excluded; rows where
## the comparison evaluates to NA are dropped by filter() as well.
Academic_exploration <- function(question, df) {
  # Inject the column by symbol with `!!` rather than the deprecated
  # UQ() / group_by_() forms.
  answer <- sym(question)

  df %>%
    dplyr::filter((!!answer) != "") %>%
    dplyr::group_by(!!answer) %>%
    # Count how many respondents selected each option
    dplyr::summarise(count = n()) %>%
    # Calculate what percent of respondents selected each option
    dplyr::mutate(percent = (count / sum(count)) * 100) %>%
    # Arrange the counts in descending order
    dplyr::arrange(desc(count))
}
## Convert a numeric vector into percentages of its own total.
##
## vec: numeric vector.
## Returns a vector of the same length (names are kept) whose elements are
## each value's share of sum(vec), on a 0-100 scale.
proportion_function <- function(vec) {
  total <- sum(vec)
  (vec / total) * 100
}
## Bin a column into labelled intervals, e.g. so that it lines up with the
## intervals used by an external data set.
##
## dfcolumn: values to bin; coerced with as.numeric(), so entries that are not
##           numeric become NA (with a coercion warning).
## breaks:   cut points passed to cut().
## labels:   labels for the resulting intervals, passed to cut().
##
## Intervals are closed on the left and open on the right (right = FALSE).
## The factor is returned invisibly.
create_breaks <- function(dfcolumn, breaks, labels) {
  numeric_values <- as.numeric(dfcolumn)
  binned <- cut(numeric_values, breaks = breaks, labels = labels, right = FALSE)
  invisible(binned)
}
## Failed function (kept for reference; not used)
#identity_plots_45 <- function(df,x,y,fill){ggplot(df, aes(x = var(x),y=var(y), fill = var(fill))) +
# geom_bar(stat="identity")+
#}
## Warning: Missing column names filled in: 'X229' [229], 'X230' [230]
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
## # A tibble: 2,895 x 2
## TimeSpentStudying ProveKnowledgeSelect
## <chr> <chr>
## 1 <NA> <NA>
## 2 <NA> <NA>
## 3 <NA> <NA>
## 4 <NA> <NA>
## 5 <NA> <NA>
## 6 <NA> <NA>
## 7 <NA> <NA>
## 8 <NA> <NA>
## 9 <NA> <NA>
## 10 <NA> <NA>
## # ... with 2,885 more rows
## # A tibble: 6 x 3
## FirstTrainingSelect count percent
## <chr> <int> <dbl>
## 1 University courses 975 34.4
## 2 Self-taught 777 27.4
## 3 Online courses (coursera, udemy, edx, etc.) 756 26.7
## 4 Work 250 8.82
## 5 Kaggle competitions 44 1.55
## 6 Other 33 1.16
Burtch Works Study on Data Scientists
Computer science leads the field with over 30%.
Close behind are mathematics and statistics, followed by electrical engineering.
About 6.5% of data scientists come from social science backgrounds.
## # A tibble: 4 x 2
## GenderSelect percent
## <chr> <dbl>
## 1 Male 84.0
## 2 Female 14.4
## 3 other 1.14
## 4 Non-binary 0.485
Burtch Works Study on Data Scientists
## # A tibble: 4 x 3
## GenderSelect count percent
## <chr> <int> <dbl>
## 1 Male 2422 84.0
## 2 Female 416 14.4
## 3 A different identity 33 1.14
## 4 Non-binary, genderqueer, or gender non-conforming 14 0.485
## Warning in create_breaks(exp_df$Age, c(1, 22.1, 28.1, 35.1, 41.1, 49.1, :
## NAs introduced by coercion
## List of 1
## $ legend.position: chr "none"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
## # A tibble: 3 x 3
## burtchworks_tenure percent_Burtch_works tenure
## <dbl> <dbl> <fctr>
## 1 150 38.0 0-5
## 2 120 30.4 6-10
## 3 125 31.6 " 10 + Years"
## # A tibble: 4 x 3
## EmploymentStatus count percent
## <chr> <int> <dbl>
## 1 Employed full-time 2348 81.1
## 2 Independent contractor, freelancer, or self-employed 332 11.5
## 3 Employed part-time 187 6.46
## 4 Retired 28 0.967
## [[1]]
## # A tibble: 7 x 3
## FormalEducation count perce~
## <chr> <int> <dbl>
## 1 Master's degree 1280 44.5
## 2 Doctoral degree 765 26.6
## 3 Bachelor's degree 627 21.8
## 4 Professional degree 100 3.47
## 5 Some college/university study without earning a bachelor's~ 76 2.64
## 6 I did not complete any formal education past high school 22 0.764
## 7 I prefer not to answer 9 0.313
##
## [[2]]
## # A tibble: 15 x 3
## MajorSelect count perce~
## <chr> <int> <dbl>
## 1 Computer Science 836 32.2
## 2 Mathematics or statistics 500 19.3
## 3 Electrical Engineering 264 10.2
## 4 Engineering (non-computer focused) 226 8.71
## 5 Physics 181 6.98
## 6 Other 138 5.32
## 7 Information technology, networking, or system administrat~ 119 4.59
## 8 A social science 104 4.01
## 9 Biology 66 2.54
## 10 Management information systems 44 1.70
## 11 Psychology 38 1.46
## 12 A health science 37 1.43
## 13 A humanities discipline 24 0.925
## 14 Fine arts or performing arts 11 0.424
## 15 I never declared a major 6 0.231
##
## [[3]]
## # A tibble: 6 x 3
## Tenure count percent
## <chr> <int> <dbl>
## 1 3 to 5 years 895 31.6
## 2 More than 10 years 652 23.0
## 3 1 to 2 years 590 20.8
## 4 6 to 10 years 515 18.2
## 5 Less than a year 173 6.11
## 6 I don't write code to analyze data 8 0.282
##
## [[4]]
## # A tibble: 6 x 3
## FirstTrainingSelect count percent
## <chr> <int> <dbl>
## 1 University courses 975 34.4
## 2 Self-taught 777 27.4
## 3 Online courses (coursera, udemy, edx, etc.) 756 26.7
## 4 Work 250 8.82
## 5 Kaggle competitions 44 1.55
## 6 Other 33 1.16
##
## [[5]]
## # A tibble: 25 x 3
## EmployerIndustry count percent
## <chr> <int> <dbl>
## 1 Academic 638 23.8
## 2 Technology 485 18.1
## 3 Financial 243 9.07
## 4 Other 216 8.07
## 5 Mix of fields 206 7.69
## 6 Internet-based 158 5.90
## 7 Government 155 5.79
## 8 Manufacturing 93 3.47
## 9 CRM/Marketing 84 3.14
## 10 Telecommunications 67 2.50
## # ... with 15 more rows
##
## [[6]]
## # A tibble: 27 x 3
## EmployerSize count percent
## <chr> <int> <dbl>
## 1 10,000 or more employees 454 19.9
## 2 100 to 499 employees 379 16.6
## 3 1,000 to 4,999 employees 345 15.1
## 4 20 to 99 employees 340 14.9
## 5 500 to 999 employees 171 7.48
## 6 10 to 19 employees 148 6.47
## 7 5,000 to 9,999 employees 146 6.38
## 8 Fewer than 10 employees 134 5.86
## 9 I don't know 94 4.11
## 10 I prefer not to answer 24 1.05
## # ... with 17 more rows
##
## [[7]]
## # A tibble: 22 x 3
## EmployerSizeChange count percent
## <chr> <int> <dbl>
## 1 Increased slightly 828 37.1
## 2 Stayed the same 710 31.8
## 3 Increased significantly 453 20.3
## 4 Decreased slightly 135 6.05
## 5 Decreased significantly 58 2.60
## 6 1,000 to 4,999 employees 7 0.314
## 7 Fewer than 10 employees 7 0.314
## 8 500 to 999 employees 5 0.224
## 9 10,000 or more employees 4 0.179
## 10 20 to 99 employees 4 0.179
## # ... with 12 more rows
### TODO: add a helper that turns the first column of a summary into a factor ordered by count.
### TODO: add a helper for the repeated ggplot bar chart, if possible.
## Draw a grid of four bar charts describing the employers of respondents in
## one industry: employer size, change in employer size, how long the employer
## has used ML, and how the respondent found the job.
##
## field:          label for the industry; only used in the grid title.
## academic_indus: data frame of survey rows already filtered to that industry.
##                 It must contain the columns EmployerSize, EmployerSizeChange,
##                 EmployerMLTime and EmployerSearchMethod.
##
## Draws the grid via grid.arrange() and returns its result.
explore_data_science <- function(field, academic_indus) {
  # Short display labels for EmployerSearchMethod, applied by rank (row 1 of
  # the count-sorted summary gets label 1, and so on).
  # NOTE(review): because the labels are positional, they are only correct if
  # the subset ranks the search methods in exactly this order - verify per
  # industry, or replace with a lookup keyed on the original answer text.
  search_method_labels <- c("friend/Family", "Internal Recruiter", "Other Way",
                            "Thru Website", "General Job Board", "Career Fair",
                            "Tech Job board", "Headhunter")

  # Summarise one question from the data frame passed in (not from a global)
  # and turn it into a horizontal bar chart whose bars keep the
  # count-descending order of the summary.
  plot_question <- function(question, short_labels = NULL) {
    summary_tbl <- Academic_exploration(question, academic_indus)
    answers <- summary_tbl[[question]]

    if (!is.null(short_labels)) {
      if (length(short_labels) == length(answers)) {
        answers <- short_labels
      } else {
        # A subset with more or fewer distinct answers cannot be relabelled
        # positionally; keep the original text rather than failing.
        warning("Expected ", length(short_labels), " answers for ", question,
                " but found ", length(answers), "; keeping original labels.",
                call. = FALSE)
      }
    }

    # Using the answers themselves as factor levels freezes the current row
    # order so ggplot does not re-sort the bars alphabetically.
    plot_data <- data.frame(
      answer = factor(answers, levels = answers),
      percent = summary_tbl$percent
    )

    ggplot(plot_data, aes(x = answer, y = percent, fill = answer)) +
      geom_bar(stat = "identity") +
      theme(legend.position = "none") +
      # Keep the column name as the axis title
      xlab(question) +
      coord_flip()
  }

  grid.arrange(
    plot_question("EmployerSize"),
    plot_question("EmployerSizeChange"),
    plot_question("EmployerMLTime"),
    plot_question("EmployerSearchMethod", short_labels = search_method_labels),
    top = paste("Data scientists employed in", field)
  )
}
## Employer profile plots for four industries. Every subset is stored under the
## same global name, `Academic_indus`, regardless of the industry it holds, so
## that any code reading that global keeps working.

# Academic
Academic_indus <- dplyr::filter(exp_df, EmployerIndustry %in% "Academic")
explore_data_science("Academic", Academic_indus)

# Technology
Academic_indus <- dplyr::filter(exp_df, EmployerIndustry %in% "Technology")
explore_data_science("Technology", Academic_indus)

# Financial
Academic_indus <- dplyr::filter(exp_df, EmployerIndustry %in% "Financial")
explore_data_science("Financial", Academic_indus)

# Government
Academic_indus <- dplyr::filter(exp_df, EmployerIndustry %in% "Government")
explore_data_science("Government", Academic_indus)
## Columns describing the respondent's current and past jobs
current_jobs <- c(
  "CurrentJobTitleSelect",
  "TitleFit",
  "PastJobTitlesSelect",
  "CurrentEmployerType"
)
## View the answer categories of each column (one summary table per column)
lapply(current_jobs, chooseOne)
## [[1]]
## # A tibble: 15 x 3
## CurrentJobTitleSelect count percent
## <chr> <int> <dbl>
## 1 Scientist/Researcher 485 16.8
## 2 Data Analyst 417 14.4
## 3 Machine Learning Engineer 345 11.9
## 4 Software Developer/Software Engineer 313 10.8
## 5 Other 247 8.54
## 6 Researcher 234 8.09
## 7 Business Analyst 182 6.30
## 8 Computer Scientist 159 5.50
## 9 Statistician 137 4.74
## 10 Engineer 118 4.08
## 11 Predictive Modeler 81 2.80
## 12 Programmer 61 2.11
## 13 Data Miner 42 1.45
## 14 DBA/Database Engineer 42 1.45
## 15 Operations Research Practitioner 28 0.969
##
## [[2]]
## # A tibble: 3 x 3
## TitleFit count percent
## <chr> <int> <dbl>
## 1 Fine 1812 65.0
## 2 Perfectly 568 20.4
## 3 Poorly 407 14.6
##
## [[3]]
## # A tibble: 874 x 3
## PastJobTitlesSelect count percent
## <chr> <int> <dbl>
## 1 Researcher 218 8.02
## 2 Software Developer/Software Engineer 118 4.34
## 3 Other 111 4.09
## 4 Data Analyst 90 3.31
## 5 Engineer 73 2.69
## 6 I haven't started working yet 57 2.10
## 7 Programmer 55 2.02
## 8 Business Analyst 51 1.88
## 9 Programmer,Software Developer/Software Engineer 38 1.40
## 10 Data Scientist 36 1.32
## # ... with 864 more rows
##
## [[4]]
## # A tibble: 60 x 3
## CurrentEmployerType count perc~
## <chr> <int> <dbl>
## 1 Employed by college or university 663 23.4
## 2 Employed by a company that performs advanced analytics 485 17.1
## 3 Employed by professional services/consulting firm 459 16.2
## 4 Employed by a company that doesn't perform advanced analyt~ 303 10.7
## 5 Employed by company that makes advanced analytic software 264 9.32
## 6 Self-employed 189 6.67
## 7 Employed by government 159 5.61
## 8 Employed by non-profit or NGO 56 1.98
## 9 Employed by company that makes advanced analytic software,~ 46 1.62
## 10 Employed by professional services/consulting firm,Employed~ 37 1.31
## # ... with 50 more rows
## Draw three bar charts for respondents sharing one current job title: how
## well the title fits, their most common past job titles, and their current
## employer types.
##
## current_job: data frame of survey rows already filtered to one job title.
##              It must contain the columns TitleFit, PastJobTitlesSelect and
##              CurrentEmployerType.
## field:       the job title; only used in the grid title.
##
## Draws the grid via grid.arrange() and returns its result.
explore_current_job <- function(current_job, field) {
  # Using a column's own values as factor levels freezes the count-descending
  # row order so ggplot does not re-sort the bars alphabetically.
  keep_row_order <- function(summary_tbl, column) {
    summary_tbl[[column]] <- factor(summary_tbl[[column]],
                                    levels = summary_tbl[[column]])
    summary_tbl
  }

  # TitleFit is single-choice
  title_fit <- keep_row_order(
    Academic_exploration("TitleFit", current_job),
    "TitleFit"
  )

  # PastJobTitlesSelect is multiple-choice; keep at most the 7 most common
  # titles. head() is used instead of [1:7, ] because row-indexing past the
  # end of the table would add all-NA rows when fewer than 7 titles exist.
  past_jobs <- keep_row_order(
    head(chooseMultiple("PastJobTitlesSelect", current_job), 7),
    "selections"
  )

  # CurrentEmployerType is multiple-choice; every employer type is shown
  employer_types <- keep_row_order(
    chooseMultiple("CurrentEmployerType", current_job),
    "selections"
  )

  # Plots
  plot_1 <- ggplot(title_fit, aes(x = TitleFit, y = percent, fill = TitleFit)) +
    geom_bar(stat = "identity") +
    theme(legend.position = "none") +
    xlab("Job Fit")
  plot_2 <- ggplot(past_jobs, aes(x = selections, y = percent, fill = selections)) +
    geom_bar(stat = "identity") +
    theme(legend.position = "none") +
    xlab("Past Job Title") +
    coord_flip()
  plot_3 <- ggplot(employer_types, aes(x = selections, y = percent, fill = selections)) +
    geom_bar(stat = "identity") +
    theme(legend.position = "none") +
    xlab("Current Employer") +
    coord_flip()

  grid.arrange(plot_1, plot_2, plot_3,
               top = paste("Data scientists with current job title", field))
}
## Job-title profile plots for six current job titles. Each subset is stored in
## the global `current_job` before being plotted.

# Scientist/Researcher
current_job <- dplyr::filter(exp_df, CurrentJobTitleSelect %in% "Scientist/Researcher")
explore_current_job(current_job, "Scientist/Researcher")

# Data Analyst
current_job <- dplyr::filter(exp_df, CurrentJobTitleSelect %in% "Data Analyst")
explore_current_job(current_job, "Data Analyst")

# Machine Learning Engineer
current_job <- dplyr::filter(exp_df, CurrentJobTitleSelect %in% "Machine Learning Engineer")
explore_current_job(current_job, "Machine Learning Engineer")

# Software Developer/Software Engineer
current_job <- dplyr::filter(exp_df, CurrentJobTitleSelect %in% "Software Developer/Software Engineer")
explore_current_job(current_job, "Software Developer/Software Engineer")

# Business Analyst
current_job <- dplyr::filter(exp_df, CurrentJobTitleSelect %in% "Business Analyst")
explore_current_job(current_job, "Business Analyst")

# Statistician
current_job <- dplyr::filter(exp_df, CurrentJobTitleSelect %in% "Statistician")
explore_current_job(current_job, "Statistician")
## Keep only respondents whose Country is the United States, then inspect the
## structure of the resulting data frame.
US_only_df <- dplyr::filter(exp_df, Country %in% "United States")
str(US_only_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 734 obs. of 24 variables:
## $ id : int 2 3 4 9 10 11 12 17 19 20 ...
## $ GenderSelect : chr "Male" "Male" "Male" "Male" ...
## $ Country : chr "United States" "United States" "United States" "United States" ...
## $ Age : chr "56" "58" "25" "43" ...
## $ EmploymentStatus : chr "Independent contractor, freelancer, or self-employed" "Independent contractor, freelancer, or self-employed" "Employed part-time" "Retired" ...
## $ CurrentJobTitleSelect: chr "Operations Research Practitioner" "DBA/Database Engineer" "Researcher" "Software Developer/Software Engineer" ...
## $ TitleFit : chr "Poorly" "Poorly" "Fine" "Perfectly" ...
## $ PastJobTitlesSelect : chr "Business Analyst,Operations Research Practitioner,Predictive Modeler,Programmer,Other" "Data Analyst,DBA/Database Engineer,Programmer,Researcher,Software Developer/Software Engineer" NA "Computer Scientist,Programmer,Researcher,Software Developer/Software Engineer" ...
## $ CurrentEmployerType : chr "Self-employed" "Employed by professional services/consulting firm" "Employed by college or university" "Employed by a company that performs advanced analytics" ...
## $ FormalEducation : chr "Master's degree" "Master's degree" "Bachelor's degree" "Doctoral degree" ...
## $ MajorSelect : chr "Mathematics or statistics" "Mathematics or statistics" "Physics" "Computer Science" ...
## $ Tenure : chr "More than 10 years" "More than 10 years" "3 to 5 years" "More than 10 years" ...
## $ FirstTrainingSelect : chr "University courses" "Online courses (coursera, udemy, edx, etc.)" "University courses" "Self-taught" ...
## $ EmployerIndustry : chr "Mix of fields" "Technology" "Academic" "Financial" ...
## $ EmployerSize : chr NA NA "I don't know" NA ...
## $ EmployerSizeChange : chr NA NA "Increased significantly" NA ...
## $ EmployerMLTime : chr NA NA "Don't know" NA ...
## $ EmployerSearchMethod : chr NA NA "Some other way" NA ...
## $ UniversityImportance : chr "Very important" "Very important" "Important" NA ...
## $ JobFunctionSelect : chr "Analyze and understand data to influence product or business decisions" "Analyze and understand data to influence product or business decisions" "Research that advances the state of the art of machine learning" NA ...
## $ RemoteWork : chr NA "Most of the time" "Sometimes" NA ...
## $ CompensationAmount : chr "250000" "120000" "20000" NA ...
## $ CompensationCurrency : chr "USD" NA "USD" NA ...
## $ age_groups : Factor w/ 7 levels "18-22","23-30",..: 6 7 2 5 2 7 3 4 3 NA ...
## Clean the US compensation amounts by removing every run of non-digit
## characters, then tabulate the cleaned values.
# NOTE(review): "\\D+" also strips decimal points, so an amount written with
# decimals would be inflated - confirm the raw values are whole numbers.
US_only_df <- US_only_df %>%
  dplyr::mutate(CompensationAmount = str_replace_all(CompensationAmount, "\\D+", ""))
us_money <- Academic_exploration(question = "CompensationAmount", df = US_only_df)
us_money
## # A tibble: 121 x 3
## CompensationAmount count percent
## <chr> <int> <dbl>
## 1 100000 15 4.60
## 2 140000 15 4.60
## 3 150000 14 4.29
## 4 80000 11 3.37
## 5 90000 11 3.37
## 6 120000 10 3.07
## 7 50000 10 3.07
## 8 65000 10 3.07
## 9 125000 9 2.76
## 10 70000 9 2.76
## # ... with 111 more rows
## Bin the US compensation amounts into salary bands and plot the share of
## respondents per band.
# The breaks must increase in step with the labels: 0 / 30k / 70k / 110k /
# 150k / Inf. The 110k boundary has to be 110000; a smaller value such as
# 11000 gets sorted in front of 30000 and shifts every label onto the wrong
# interval.
my_dat <- create_breaks(us_money$CompensationAmount,
                        breaks = c(0, 30000, 70000, 110000, 150000, Inf),
                        labels = c('<30k','30-70k',"70-110k","110-150k","150k+"))
us_money$groups <- my_dat
# us_money has one row per distinct amount, so the bars stack the percentages
# of all amounts that fall into the same band.
ggplot(us_money, aes(x = groups, y = percent, fill = groups)) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  # NOTE(review): the scale below is also given the name "groups", which
  # appears to take precedence over this xlab() - confirm which axis title is
  # intended.
  xlab("Current Employer") +
  # Fix the band order on the axis
  scale_x_discrete("groups", limits = c('<30k','30-70k',"70-110k","110-150k","150k+"))
##Gender
# gender_1 <- c(males=163,females=40)
# gender_2 <- c(males=152,females=51)
# proportion_function(gender_1)
# proportion_function(gender_2)
# ##AGE
# proportion_function(average_age_1)
# proportion_function(average_age_2)
# average_age_1 <-c("0-18"=54,111,145,129,92,75,55)
# average_age_2 <-c("0-18"=58,106,137,136,92,76,56)
##Education
# education_1 <- c(45,104,260)
# education_2 <- c(46,110,240)
# education_1 <- c("no_college"=45, "college"=104, "grad school"=260)
# education_2 <- c("no_college"=46, "college"=110, "grad school"=240)
#
##Salary
# 92.798
# salary_1 <- c(65,110,166,179)
# salary_2 <- c(62,111,170,192)
# identity_plots_45(majors,x='MajorSelect',y='percent',fill='MajorSelect')
## 4th graph (job functions): couldn't get it in
#Jobfunctionsselct
# current_job_JobFunction<- Academic_exploration("JobFunctionSelect",current_job)
# current_job_JobFunction <- current_job_JobFunction %>%
# filter(JobFunctionSelect%in%c("Build prototypes to explore applying machine learning to new areas","Analyze and understand data to #influence product or business decisions","Research that advances the state of the art of machine learning",
# "Build and/or run the data infrastructure that your business uses for storing, analyzing, and operationalizing data",
# "Build and/or run a machine learning service that operationally improves your product or workflows",NA,"Other"))
# current_job_JobFunction <- current_job_JobFunction[1:8,]
# current_job_JobFunction$JobFunctionSelect<- factor( current_job_JobFunction$JobFunctionSelect, levels= #current_job_JobFunction$JobFunctionSelect)
# plot_4 <-ggplot(current_job_JobFunction, aes(x = JobFunctionSelect,y=percent, fill = JobFunctionSelect)) +
# geom_bar(stat="identity")+
# theme(legend.position="none")
# theme(axis.text.x=element_text(angle=20,hjust=1))
# unique(exp_df$JobFunctionSelect)
# orig_names <- c("Build prototypes to explore applying machine learning to new areas",
# "Analyze and understand data to influence product or business decisions",
# "Research that advances the state of the art of machine learning",
# "Build and/or run the data infrastructure that your business uses for storing, analyzing, and operationalizing data",
# "Build and/or run a machine learning service that operationally improves your product or workflows",
# NA,
# "Other)
#
# new_names <-
#
#
```