This notebook uses the Kaggle Data Science Survey dataset to understand the tools, preferred languages and commonly used algorithms of data science practitioners across various working fields.
Load libraries
library(tidyverse)
library(ggdark)
library(viridis)
library(ggsci)
library(skimr)
Import data
# Read the survey export. Text columns must stay character (not factor):
# later steps compare them against "" and pass them to strsplit(), which
# rejects factors. This is the default from R 4.0 on; stated explicitly so the
# notebook behaves the same on older R versions.
data <- read.csv("kagglesurvey.csv", stringsAsFactors = FALSE)
# Rows x columns of the raw data
dim(data)
[1] 10153 5
# Preview the first six rows of the raw data
head(data, n = 6L)
- 10153 survey responses in the dataset
# Column types and a sample of values for each variable
str(object = data)
'data.frame': 10153 obs. of 5 variables:
$ Respondent : int 1 2 3 4 5 6 7 8 9 10 ...
$ WorkToolsSelect : chr "Amazon Web services,Oracle Data Mining/ Oracle R Enterprise,Perl" "Amazon Machine Learning,Amazon Web services,Cloudera,Hadoop/Hive/Pig,Impala,Java,Mathematica,MATLAB/Octave,Micr"| __truncated__ "C/C++,Jupyter notebooks,MATLAB/Octave,Python,R,TensorFlow" "Jupyter notebooks,Python,SQL,TensorFlow" ...
$ LanguageRecommendationSelect: chr "F#" "Python" "Python" "Python" ...
$ EmployerIndustry : chr "Internet-based" "Mix of fields" "Technology" "Academic" ...
$ WorkAlgorithmsSelect : chr "Neural Networks,Random Forests,RNNs" "Bayesian Techniques,Decision Trees,Random Forests,Regression/Logistic Regression" "Bayesian Techniques,CNNs,Ensemble Methods,Neural Networks,Regression/Logistic Regression,SVMs" "Bayesian Techniques,CNNs,Decision Trees,Gradient Boosted Machines,Neural Networks,Random Forests,Regression/Log"| __truncated__ ...
Dataset features:
- Respondent: id
- WorkToolsSelect: Tools used
- LanguageRecommendationSelect: Preferred Language
- EmployerIndustry: Working fields
- WorkAlgorithmsSelect: Algorithm commonly used by respondents
Missing data
# Convert blank strings to NA so that missingness can be summarised.
# Only character columns are converted: na_if() casts `y` to the type of `x`,
# so applying "" to the integer Respondent column raises an error in
# dplyr >= 1.1.
data1 <- data %>%
  mutate_if(is.character, ~ na_if(., ""))
# Missing values per column (base R alternative to skim()):
# vapply(data1, function(x) sum(is.na(x)), integer(1))
skim(data1)
── Data Summary ────────────────────────
Values
Name data1
Number of rows 8132
Number of columns 7
_______________________
Column type frequency:
character 4
numeric 3
________________________
Group variables None
── Variable type: character ───────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 WorkToolsSelect 177 0.978 1 834 0 5248 0
2 LanguageRecommendationSelect 1598 0.803 1 8 0 13 0
3 EmployerIndustry 29 0.996 5 32 0 16 0
4 WorkAlgorithmsSelect 831 0.898 4 216 0 1420 0
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 Respondent 0 1 4515. 2844. 1 2035. 4244. 6922. 10153 ▇▇▆▆▅
2 wt_counts 0 1 5.56 3.47 0 3 5 7 49 ▇▁▁▁▁
3 alg_count 0 1 3.28 2.43 0 1 3 5 15 ▇▅▁▁▁
# Count the rows of data1 that have no NA in any column
data1 %>%
  filter(complete.cases(.)) %>%
  tally()

# Collect the rows in which every survey answer is blank (only the ID is
# filled in), then remove them from the raw data by matching on Respondent.
incomplete_df <- data %>%
  filter(
    WorkToolsSelect == "",
    LanguageRecommendationSelect == "",
    EmployerIndustry == "",
    WorkAlgorithmsSelect == ""
  )
cdf <- data %>%
  anti_join(incomplete_df, by = "Respondent")
dim(cdf)
[1] 9027 5
# Collect the remaining rows in which WorkToolsSelect,
# LanguageRecommendationSelect and WorkAlgorithmsSelect are all blank at the
# same time, then remove them by matching on Respondent. Rows with only some of
# these three columns blank are kept.
incomplete_df2 <- cdf %>%
  filter(
    WorkToolsSelect == "",
    LanguageRecommendationSelect == "",
    WorkAlgorithmsSelect == ""
  )
cdf2 <- cdf %>%
  anti_join(incomplete_df2, by = "Respondent")
dim(cdf2)
[1] 8132 5
# From here on, `data` refers to the filtered subset rather than the raw import
data <- cdf2
- Out of 10153 observations, there are:
- 5991 complete cases
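- 9027 obs remain after dropping rows that are blank in every column except the ID
- of which, 8132 obs remain after also dropping rows that are blank in all three of WorkToolsSelect, LanguageRecommendationSelect and WorkAlgorithmsSelect (rows with only some of these blank are kept)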
- The following sections use the subset containing 8132 obs.
Preferred language
# Horizontal bar chart of how often each language was recommended.
# Blank answers are converted to NA before plotting.
data %>%
  group_by(LanguageRecommendationSelect) %>%
  tally() %>%
  mutate_if(is.character, ~ na_if(., "")) %>%
  ggplot(aes(x = reorder(LanguageRecommendationSelect, n), y = n)) +
  geom_col(width = 0.8) +
  coord_flip() +
  labs(x = "", y = "Count", title = "Preferred language") +
  dark_theme_minimal()

# Number of distinct values in the column (the blank answer counts as one)
length(unique(data[["LanguageRecommendationSelect"]]))
[1] 14
# Frequency table over every level, sorted by count; blank answers are shown
# as NA. `sort = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
data %>%
  group_by(LanguageRecommendationSelect) %>%
  tally(sort = TRUE) %>%
  mutate_if(is.character, ~ na_if(., "")) %>%
  mutate(prop = round(n / sum(n), 3))
# Same table restricted to respondents who named a language, so that `prop`
# is the share among those who answered.
data %>%
  filter(LanguageRecommendationSelect != "") %>%
  group_by(LanguageRecommendationSelect) %>%
  tally(sort = TRUE) %>%
  mutate(prop = round(n / sum(n), 3))
- 14 levels in the variable LanguageRecommendationSelect (preferred language), including one NA level
- 1598 out of 8132 respondents (19.7%) did not specify any language of preference
- of those that specified languages: 62% prefer Python, 25.6% prefer R and 4.2% prefer SQL
Working field
# Horizontal bar chart of respondent counts per employer industry.
# Blank answers are converted to NA before plotting.
data %>%
  group_by(EmployerIndustry) %>%
  tally() %>%
  mutate_if(is.character, ~ na_if(., "")) %>%
  ggplot(aes(x = reorder(EmployerIndustry, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Count", title = "Working field") +
  dark_theme_minimal()

# Number of distinct values in EmployerIndustry (the blank answer counts as one)
length(unique(data[["EmployerIndustry"]]))
[1] 17
# Frequency table of employer industries, sorted by count, with each level's
# share of all respondents; blank answers are shown as NA.
# `sort = TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
data %>%
  group_by(EmployerIndustry) %>%
  tally(sort = TRUE) %>%
  mutate_if(is.character, ~ na_if(., "")) %>%
  mutate(prop = round(n / sum(n), 3))
- 29 out of 8132 respondents did not specify a working field
- Nearly half of the respondents (~46%) work in the Technology, Academic or Financial fields.
Count of algorithms per response
# Number of comma-separated algorithms listed in each response.
# A blank answer splits into an empty vector and therefore counts as 0.
data[["alg_count"]] <- lengths(
  strsplit(data[["WorkAlgorithmsSelect"]], split = ",")
)
summary(data[["alg_count"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.000 3.000 3.277 5.000 15.000
# Bar chart: number of responses for each algorithm count
data %>%
  group_by(alg_count) %>%
  tally() %>%
  ggplot(aes(x = alg_count, y = n)) +
  geom_col(width = 0.8) +
  labs(x = "Number of algorithms per response", y = "Count") +
  dark_theme_minimal()

# Same distribution as a table, with the share of responses per count
data %>%
  group_by(alg_count) %>%
  tally() %>%
  mutate(prop = round(n / sum(n), 3))
- Median of 3 algorithms and maximum of 15 algorithms listed
- 831 out of 8132 responses (10.2%) did not list any commonly used algorithms
- Around 62% of the respondents listed one to four commonly used algorithms
Algorithms user counts
# Split each comma-separated answer into a list-column, then flatten it so
# that every (respondent, algorithm) pair gets its own row.
# A blank answer yields a single row with an empty string.
alg <- data %>%
  mutate(
    WorkAlgorithmsSelect = str_split(
      string = WorkAlgorithmsSelect,
      pattern = ","
    )
  ) %>%
  unnest(WorkAlgorithmsSelect)
# Number of distinct algorithm values (the blank answer counts as one)
length(unique(alg[["WorkAlgorithmsSelect"]]))
[1] 16
# Number of respondents listing each algorithm, sorted by count; the blank
# answer is shown as NA. `sort = TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
alg1 <- alg %>%
  group_by(WorkAlgorithmsSelect) %>%
  tally(sort = TRUE) %>%
  mutate_if(is.character, ~ na_if(., ""))
alg1
# Horizontal bar chart of the same counts
alg1 %>%
  ggplot(aes(x = reorder(WorkAlgorithmsSelect, n), y = n)) +
  geom_col(width = 0.8) +
  dark_theme_minimal() +
  labs(x = "", y = "Count", title = "Commonly used Algorithms") +
  coord_flip()

- 16 unique levels in the variable WorkAlgorithmsSelect, including one NA level
- The three most frequently listed algorithms are:
- Regression/Logistic Regression (n=4636)
- Decision Trees (n=3460)
- Random Forests (n=3378)
- The least frequently listed algorithm is GANs (n=207)
Most frequent commonly used algorithms in each industry
# Most frequently listed algorithm in each industry, with its share of all
# algorithm mentions within that industry. Blank industries and blank
# algorithm answers are excluded.
alg %>%
  filter(EmployerIndustry != "", WorkAlgorithmsSelect != "") %>%
  group_by(EmployerIndustry, WorkAlgorithmsSelect) %>%
  # tally() drops the last grouping level, so the steps below run once per
  # EmployerIndustry.
  tally() %>%
  mutate(prop = round(n / sum(n), 3)) %>%
  # Rank by the raw count rather than the rounded proportion: two algorithms
  # with nearly equal counts can round to the same `prop`, and the tie would
  # then be decided by alphabetical order instead of frequency.
  arrange(desc(n), .by_group = TRUE) %>%
  slice(1) %>%
  as.data.frame()
- Regression/Logistic Regression is the most frequently listed algorithm in every working field except Military/Security, where Neural Networks is the most frequent.
# Most recommended language in each industry, with its share among the
# respondents of that industry who named a language.
data %>%
  filter(EmployerIndustry != "", LanguageRecommendationSelect != "") %>%
  group_by(EmployerIndustry, LanguageRecommendationSelect) %>%
  # tally() drops the last grouping level, leaving one group per industry
  tally() %>%
  mutate(prop = n / sum(n)) %>%
  arrange(desc(prop), .by_group = TRUE) %>%
  slice(1) %>%
  as.data.frame()
# Dot grid: one point for every (language, industry) combination that occurs
# among respondents who answered both questions.
data %>%
  filter(EmployerIndustry != "", LanguageRecommendationSelect != "") %>%
  ggplot(
    aes(
      x = LanguageRecommendationSelect,
      y = fct_rev(EmployerIndustry),
      color = LanguageRecommendationSelect
    )
  ) +
  geom_point() +
  scale_color_simpsons() +
  labs(y = "", x = "") +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.background = element_rect(fill = "white"),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

- Python, R and SQL are listed as respondents’ preferred languages across all 16 working fields.
