Project 3: Machine Learning and Data Science Survey

Introduction: In 2018, Kaggle conducted a survey to gather information on the state of data science and machine learning around the world. A total of 23,859 responses were collected and was compiled for analysis and the raw data can be found via https://www.kaggle.com/kaggle/kaggle-survey-2018?select=SurveySchema.csv. The survey was given in the form of a multiple choice and free form response. For the purpose of this analysis, the multiple choice dataset was considered.

Purpose: This analysis studies a subset of the raw multiple choice data to better understand the skills currently required in industry in the US. The data scientist’s skill and experience will be measured by the highest level of degree earned, the type of degree earned, the type of software used (specialized or non-specific), the number of software a data scientist is proficient in, and the reported proficiency level of the software predominately used. Additionally, these will be compared globally and the analysis will determine if there is a geographical trend.

Gauge the level of education of data scientists in the US. Is there a correlation between the level of education and marketability of the data scientist? Is there a strong trend in the level of education that Data Scientist pursue?

Cursory look at the raw count per degree. Overall, there were significantly more people with a Master’s degree.

Load the data

urlfile<-"https://raw.githubusercontent.com/baruab/msdsrepo/main/Project_3_607/kaggle-survey-2018/multipleChoiceResponses.csv"
survey_raw<-read_csv(url(urlfile))

survey_tib <- survey_raw

names(survey_tib) <- paste(names(survey_tib),survey_tib[1,],sep="_")

#Remove row 1
survey_tib <- survey_tib[-c(1),]

Filter the data

Select only Data Scientist(DS) in the United States of America

survey_tib <- survey_tib %>% 
 filter((`Q3_In which country do you currently reside?`=='United States of America')&(`Q6_Select the title most similar to your current role (or most recent title if retired): - Selected Choice`=='Data Scientist'))
#write.csv(survey_tib,"C:/Users/newma/OneDrive/Desktop/MSDS Fall 2021/DATA 607 - Data Acquisition and Mgt/Project 3/surv.csv", row.names = FALSE)

Select Required columns for Analysis

survey_tib <- survey_tib %>% 
  select(starts_with(c("Q1_","Q2_","Q3_","Q6_","Q7_","Q13","Q16","Q21","Q33")))
#names(survey_tib)

Public datasets

Where do you find public datasets?

surv_dataset <- survey_tib %>% select(starts_with(c("Q1_","Q2_","Q3_","Q7_","Q33")),-contains("OTHER_TEXT")) %>%
pivot_longer(starts_with("Q33"),names_to="Dataset",values_to="Dataset_type")

surv_dataset_tib<-surv_dataset %>% 
  count(surv_dataset$Dataset_type) %>%
  rename("DataSource" = "surv_dataset$Dataset_type","Count"="n")%>%
  arrange(desc(Count))

#write.csv(surv_dataset,"C:/Users/newma/OneDrive/Desktop/MSDS Fall 2021/DATA 607 - Data Acquisition and Mgt/Project 3/tibviz.csv", row.names = FALSE)

surv_dataset_tib<-surv_dataset_tib[-c(1),]
surv_dataset_tib2<-surv_dataset_tib %>% 
  mutate(proportion = round((surv_dataset_tib$Count /sum(surv_dataset_tib$Count))*100,2))

surv_dataset_tib2 %>% ggplot(aes(reorder(DataSource,proportion),proportion)) +
  geom_col(fill="#06272C")+coord_flip()+geom_text(aes(label=proportion),color="red") +labs(x="Data Sources")+theme_bw()

kbl(surv_dataset_tib2)%>%
  kable_styling() %>% kable_paper("hover", full_width = F)

DataSource	Count	proportion
Dataset aggregator/platform (Socrata, Kaggle Public Datasets Platform, etc.)	369	14.51
Google Search	367	14.43
Government websites	338	13.29
I collect my own data (web-scraping, etc.)	302	11.88
GitHub	280	11.01
University research group websites	214	8.42
Google Dataset Search	190	7.47
Publicly released data from private companies	171	6.72
Non-profit research group websites	149	5.86
None/I do not work with public data	138	5.43
Other	25	0.98

survey<-subset(survey_raw,select=c(5:8,13))
#filtering out data listed under data scientist only
survey_ds<-filter(survey,survey$Q6=="Data Scientist")
survey_ds<-subset(survey_ds,select=-c(4))
#renaming columns
colnames(survey_ds)<-c("country","highest_degree","undergrad_deg","salary")

Count by Educational Degree

p<-ggplot(survey_ds,aes(x=factor(highest_degree)))+
  geom_bar(stat="count",width=0.7,fill="steelblue")+
  theme_minimal()
q<-p+aes(stringr::str_wrap(highest_degree,15))+xlab(NULL)
q<-q+coord_flip()
survey_ds<-survey_ds%>%
  mutate(highest_degree=str_replace(highest_degree,"Some college/university study without earning a bachelor’s degree","Some College"))
survey_ds<-survey_ds%>%
  mutate(highest_degree=str_replace(highest_degree,"No formal education past high school","High School"))
p<-survey_ds%>%
  group_by(highest_degree)%>%
  summarise(count=n())%>%
  ggplot(aes(x=reorder(highest_degree,(count)),y=count))+
  geom_bar(stat="identity")+
  theme_minimal()+
  coord_flip()+
   xlab(NULL)+ylab("count")
p

Correlation between Salary and Degree

#Is there a correlation between salary and highest degree earned?
undergrad<-unique(survey_ds$undergrad_deg)
survey_ds$salary<-str_replace_all(survey_ds$salary,"(\\d|\\d+)-","") 
survey_ds$salary<-str_replace_all(survey_ds$salary,",","")
survey_ds$salary<-as.numeric(survey_ds$salary)
survey_ds$salary<-survey_ds$salary/1000

Filter on US Salary data

The following shows a stacked barplot that counts the number of data scientists per salary range, which is further divided between degree level earned in the US only. This is the raw count of all degree options offered in the survey. Some features to note is that data scientists in general earn less than $250K. Generally there are more data scientists that earned a Bachelor’s, Master’s, or Doctoral degree than those that partially completed a program, earned a certificate, or had no formal education. The overall salary mean in the US is $136K.

#US count only
#Is there a correlation between salary and highest degree earned?
survey_ds_US<-filter(survey_ds,country=="United States of America")
#raw plot with all values
ggplot(survey_ds_US%>%count(salary,highest_degree),
  aes(salary,n,fill=highest_degree))+
  geom_bar(stat="identity")+
  ggtitle("Salary and Highest Degree in the US")+
  xlab("salary K ($)")+ylab("count")

mean(survey_ds_US$salary,na.rm=TRUE)

## [1] 136.1321

Group by Salary range and Degree

This analysis will focus on the salary ranges between $0-$250K since there are more data points in those ranges. The data is further filtered to focus on those that had a Bachelor’s, Master’s and Doctoral degrees. The salary mean when looking at this truncated data drops to $123K from $135K.

#removing null data and looking at data bettween 0-100k salary
survey_f1<-filter(survey_ds_US,salary<250)
survey_f1<-filter(survey_f1,highest_degree!="I prefer not to answer")
#further filter out non-degree inputs
survey_f2<-filter(survey_f1,highest_degree=="Master’s degree"|highest_degree=="Bachelor’s degree"|highest_degree=="Doctoral degree")
ggplot(survey_f2%>%count(salary,highest_degree),
  aes(salary,n,fill=highest_degree))+
  geom_bar(stat="identity")+
  ggtitle("Salary and Highest Degree in the US")+
  xlab("salary K ($)")+ylab("count")

mean(survey_f2$salary,na.rm=TRUE)

## [1] 123.2369

Since it is difficult to determine if there is an obvious trend between salary and education level, the barplots are adjusted to show proportions of degree per salary rather than count. The adjusted bar plot shows that the proportion of Master’s and Doctoral degrees do not increase by an obvious margin as expected but rather stays consistent. However, the number of Bachelor degrees decrease generally.

#summarizing with proportions for the US
subset1<-subset(survey_f2,select=c("highest_degree","salary"))
MS<-filter(subset1,highest_degree=="Master’s degree")
BA<-filter(subset1,highest_degree=="Bachelor’s degree")
PHD<-filter(subset1,highest_degree=="Doctoral degree")
MS_count<-count(MS,salary)
BA_count<-count(BA,salary)
PHD_count<-count(PHD,salary)
PHD_count<-PHD_count%>%add_row(salary=50,n=0,.before=5)
deg_freq1<-data.frame(BA_count$n,MS_count$n,PHD_count$n)
row_sum<-rowSums(deg_freq1)
deg_freq1<-data.frame(BA_count$salary,BA_count$n,MS_count$n,PHD_count$n,row_sum)
BA_pct<-round(deg_freq1$BA_count.n/deg_freq1$row_sum*100)
BA_pct<-data.frame(BA[1:length(BA_pct),1],BA_pct,BA_count$salary)
colnames(BA_pct)<-c('highest degree','percent','salary')
MS_pct<-round(deg_freq1$MS_count.n/deg_freq1$row_sum*100)
MS_pct<-data.frame(MS[1:length(MS_pct),1],MS_pct,BA_count$salary)
colnames(MS_pct)<-c('highest degree','percent','salary')
PHD_pct<-round(deg_freq1$PHD_count.n/deg_freq1$row_sum*100)
PHD_pct<-data.frame(PHD[1:length(PHD_pct),1],PHD_pct,BA_count$salary)
colnames(PHD_pct)<-c('highest degree','percent','salary')
combined<-rbind(BA_pct,MS_pct,PHD_pct)

combined<-combined%>%
  group_by(salary)%>%
  arrange(salary,desc(`highest degree`))%>%
  mutate(lab_ypos=cumsum(percent)-0.5*percent)
ggplot(combined,aes(x=salary,y=percent))+
  geom_col(aes(fill=`highest degree`),width=8)+
  xlab("salary K ($)")+ylab("percent %")+
  ggtitle("Degree and Salary in the US <$250K")+
  geom_text(aes(y=lab_ypos,label=percent,group=`highest degree`),color="white")

When looking at the ranges above $250K, it is interesting to see that there are still some with Bachelor’s degrees only.

#For salary greater than 200k
survey_f3<-filter(survey_ds_US,salary>250)
survey_f3<-filter(survey_f3,highest_degree!="I prefer not to answer")
survey_f3<-filter(survey_f3,highest_degree=="Master’s degree"|highest_degree=="Bachelor’s degree"|highest_degree=="Doctoral degree")
ggplot(survey_f3%>%count(salary,highest_degree),
  aes(salary,n,fill=highest_degree))+
  geom_bar(stat="identity")+
  ggtitle("Salary and Highest Degree Globally >$250K")+
  xlab("salary K ($)")+ylab("count")

The data is further analyzed using proportions instead of count and people with Doctoral and Master’s degrees make up majority of the people who earn greater than $250K.

#summarizing with proportions for the US
subset3<-subset(survey_f3,select=c("highest_degree","salary"))
MS<-filter(subset3,highest_degree=="Master’s degree")
BA<-filter(subset3,highest_degree=="Bachelor’s degree")
PHD<-filter(subset3,highest_degree=="Doctoral degree")
MS_count<-count(MS,salary)
BA_count<-count(BA,salary)
PHD_count<-count(PHD,salary)
BA_count<-BA_count%>%add_row(salary=500,n=0)
deg_freq1<-data.frame(BA_count$n,MS_count$n,PHD_count$n)
row_sum<-rowSums(deg_freq1)
deg_freq1<-data.frame(BA_count$salary,BA_count$n,MS_count$n,PHD_count$n,row_sum)
BA_pct<-round(deg_freq1$BA_count.n/deg_freq1$row_sum*100)
BA_pct<-data.frame(BA[1:length(BA_pct),1],BA_pct,BA_count$salary)
colnames(BA_pct)<-c('highest degree','percent','salary')
MS_pct<-round(deg_freq1$MS_count.n/deg_freq1$row_sum*100)
MS_pct<-data.frame(MS[1:length(MS_pct),1],MS_pct,BA_count$salary)
colnames(MS_pct)<-c('highest degree','percent','salary')
PHD_pct<-round(deg_freq1$PHD_count.n/deg_freq1$row_sum*100)
PHD_pct<-data.frame(PHD[1:length(PHD_pct),1],PHD_pct,BA_count$salary)
colnames(PHD_pct)<-c('highest degree','percent','salary')
combined<-rbind(BA_pct,MS_pct,PHD_pct)
combined<-combined%>%
  group_by(salary)%>%
  arrange(salary,desc(`highest degree`))%>%
  mutate(lab_ypos=cumsum(percent)-0.5*percent)
ggplot(combined,aes(x=salary,y=percent))+
  geom_col(aes(fill=`highest degree`),width=50)+
  xlab("salary K ($)")+ylab("percent %")+
  ggtitle("Degree and Salary in the US <$250K")+
  geom_text(aes(y=lab_ypos,label=percent,group=`highest degree`),color="white")

Conclusion: It seems that in the salary ranges less than $250K, having a Bachelor’s, Master’s, or Doctoral degree makes up the largest proportions. There does not seem to be an obvious trend to show that either of these degrees will increase the likelihood of earning more within this range. However as you move into salary ranges greater than $250K, the majority of the people either have at least a Master’s and Doctoral. Surprisingly enough, there are still a good proportion of people with Bachelor’s degrees. The marketability of a data scientist may not fully depend on their level of education alone but other factors should be considered.

Tools and Libraries used by DS

What data visualization libraries or tools have you used in the past 5 years?

surv_viz_tool<- survey_tib %>% select(starts_with(c("Q1_","Q2_","Q3_","Q7_","Q21")),-contains("OTHER_TEXT")) %>%
pivot_longer(starts_with("Q21"),names_to="ToolName",values_to="Tool")

surv_viz_tool_tib <- surv_viz_tool %>%
  count(surv_viz_tool$Tool) %>% rename("VisualTools" = "surv_viz_tool$Tool","Count"="n")%>%
  arrange(desc(Count))

surv_viz_tool_tib <- surv_viz_tool_tib[-c(1),]
surv_viz_tool_tib2<-surv_viz_tool_tib %>%
  mutate(proportion = round((Count /sum(Count))*100,2))%>% arrange(desc(proportion))
  
surv_viz_tool_tib2 %>% ggplot(aes(reorder(VisualTools,proportion),proportion)) +
  geom_col()+coord_flip()+geom_col(fill="#A7ADBE")+
  geom_text(aes(label=proportion),color="red") +labs(x="Visual Tools")+theme_bw()

kbl(surv_viz_tool_tib2) %>%
  kable_styling() %>% kable_paper("hover", full_width = F)

VisualTools	Count	proportion
Matplotlib	689	21.46
ggplot2	565	17.60
Seaborn	527	16.42
Plotly	401	12.49
Shiny	308	9.60
Bokeh	208	6.48
D3	207	6.45
Leaflet	104	3.24
Lattice	75	2.34
Geoplotlib	49	1.53
Altair	33	1.03
Other	31	0.97
None	13	0.40

IDE(s) used

Which of the following integrated development environments (IDE’s) have you used at work or school in the last 5 years?

ide_tool<- survey_tib %>% 
  select(starts_with(c("Q1_","Q2_","Q3_","Q7_","Q13")),-contains("OTHER_TEXT")) %>%
pivot_longer(starts_with("Q13"),names_to="IDEName",values_to="IDEUsed")
#write.csv(surv_viz_tool,"C:/Users/newma/OneDrive/Desktop/MSDS Fall 2021/DATA 607 - Data Acquisition and Mgt/Project 3/survviz.csv", row.names = FALSE)

ide_tool_tib <- ide_tool %>%
  count(ide_tool$IDEUsed) %>% rename("IDE" = "ide_tool$IDEUsed","Count"="n")%>%
  arrange(desc(Count))

ide_tool_tib<-ide_tool_tib[-c(1),]
ide_tool_tib2<-ide_tool_tib %>%
  mutate(proportion = round((Count /sum(Count))*100,2))%>%arrange(desc(proportion))

ide_tool_tib2 %>% ggplot(aes(reorder(IDE,proportion),proportion)) +
  geom_col(fill="#18213A")+coord_flip()+
  geom_text(aes(label=proportion),color="red")+
  labs(x="IDE")+theme_bw()

kbl(ide_tool_tib2)%>%
  kable_styling() %>% kable_paper("hover", full_width = F)

IDE	Count	proportion
Jupyter/IPython	763	19.36
RStudio	586	14.87
Sublime Text	344	8.73
PyCharm	335	8.50
Notepad++	302	7.66
Spyder	298	7.56
Vim	287	7.28
Atom	251	6.37
MATLAB	203	5.15
Visual Studio Code	193	4.90
Visual Studio	166	4.21
IntelliJ	144	3.65
Other	52	1.32
nteract	16	0.41
None	2	0.05

Programming Languages used

What programming languages do you use on a regular basis?

lang_tool<- survey_tib %>% 
  select(starts_with(c("Q1_","Q2_","Q3_","Q7_","Q16")),-contains("OTHER_TEXT")) %>%
pivot_longer(starts_with("Q16"),names_to="LangName",values_to="LangUsed")

Lang_tool_tib <- lang_tool %>%
  count(lang_tool$LangUsed) %>% rename("Language" = "lang_tool$LangUsed","Count"="n")%>%
  arrange(desc(Count))

Lang_tool_tib<-Lang_tool_tib[-c(1),]
Lang_tool_tib2<-Lang_tool_tib %>%
  mutate(proportion = round((Count /sum(Count))*100,2))%>%arrange(desc(proportion))

Lang_tool_tib2 %>% ggplot(aes(reorder(Language,proportion),proportion)) +
  geom_col(fill='#633974')+coord_flip()+
  geom_text(aes(label=proportion),color="green")+ labs(x="Language")+theme_bw()

kbl(Lang_tool_tib2) %>%
  kable_styling() %>% kable_paper("hover", full_width = F)

Language	Count	proportion
Python	777	30.53
SQL	562	22.08
R	452	17.76
Bash	207	8.13
Javascript/Typescript	89	3.50
SAS/STATA	72	2.83
Java	64	2.51
MATLAB	64	2.51
Scala	62	2.44
C/C++	55	2.16
Visual Basic/VBA	43	1.69
C#/.NET	25	0.98
Other	23	0.90
Go	17	0.67
Julia	17	0.67
PHP	9	0.35
Ruby	5	0.20
None	2	0.08

Salary vs Number of Programming Languages used

# declare lists that will be used to preprocess data
x <- c(10000, 20000, 125000, 150000, 200000, 30000, 250000, 300000, 40000, 400000, 50000, 500000, 60000, 600000, 70000, 80000, 90000, 100000)
level <- c("0-10,000", "10-20,000", "100-125,000", "125-150,000", "150-200,000", "20-30,000", "200-250,000", "250-300,000", "30-40,000", "300-400,000", "40-50,000", "400-500,000", "50-60,000", "500,000+", "60-70,000", "70-80,000", "80-90,000", "90-100,000")


# select desired filters


a = survey_raw %>%
  filter(Q6=="Data Scientist")
a = a %>%
  filter(Q3 == "United States of America")
# Choose necessary variables
b <- a[,c(1:13,66:81,83,111:121,123,266:276)]
# remove irrelevant data
b <- b[b$Q9 != "I do not wish to disclose my approximate yearly compensation", ]  
b <- b[b$Q9 != "", ] 
# replace character data with numeric data
for (i in 1:length(level)) {
  b$Q9[b$Q9 == level[i]] <- x[i] 
}
b$Q9 <- as.numeric(b$Q9)

# change empty strings to NA for easier count
b$Q16_Part_1[b$Q16_Part_1 == ""] <- NA
b$Q16_Part_2[b$Q16_Part_2 == ""] <- NA
b$Q16_Part_3[b$Q16_Part_3 == ""] <- NA
b$Q16_Part_4[b$Q16_Part_4 == ""] <- NA
b$Q16_Part_5[b$Q16_Part_5 == ""] <- NA
b$Q16_Part_6[b$Q16_Part_6 == ""] <- NA
b$Q16_Part_7[b$Q16_Part_7 == ""] <- NA
b$Q16_Part_8[b$Q16_Part_8 == ""] <- NA
b$Q16_Part_9[b$Q16_Part_9 == ""] <- NA
b$Q16_Part_10[b$Q16_Part_10 == ""] <- NA
b$Q16_Part_11[b$Q16_Part_11 == ""] <- NA
b$Q16_Part_12[b$Q16_Part_12 == ""] <- NA
b$Q16_Part_13[b$Q16_Part_13 == ""] <- NA
b$Q16_Part_14[b$Q16_Part_14 == ""] <- NA
b$Q16_Part_15[b$Q16_Part_15 == ""] <- NA
b$Q16_Part_16[b$Q16_Part_16 == ""] <- NA
b$Q16_Part_18[b$Q16_Part_18 == ""] <- NA

# calculate count
c = 0
c <- b %>% rowwise() %>% mutate(Count=sum(!is.na(c_across(14:30))))
d <- count(c,as.factor(Count))

# scatterplot
ggplot(c, aes(x=Count, y=Q9)) +
  geom_point(position = position_jitter(width = .07, height = .07)) +
  scale_color_gradient(low='red', high='green') +
  ggtitle("Salary vs Number of Programming Languages used") +
  xlab("Programming Language by Count") +
  ylab("Salary") +
  stat_smooth(method = "lm", col = "red")

# linear regression to get the exact slope
lm(data=c,Q9~Count)

## 
## Call:
## lm(formula = Q9 ~ Count, data = c)
## 
## Coefficients:
## (Intercept)        Count  
##    134475.9        795.3

Conclusion: Salary vs Number of Programming Languages used

We can conclude that having more knowledge and ability in a handful of programming languages is more lucrative in terms of salary. The highest paid participants in the survey knew between 5 and 9 different programming languages. The data supports that it is possible to receive a high paying job at any domain within the scatter plot (5 to 13) in which there is a slight upward trend in Salary as the Number of Programming Languages increases. The linear regression output shows that for every unit increase of a Programming Language used increases the participant’s Salary by $795.30.

Salary vs Number of Visualization Tools used

# Change empty string to NA
b$Q21_Part_1[b$Q21_Part_1 == ""] <- NA
b$Q21_Part_2[b$Q21_Part_2 == ""] <- NA
b$Q21_Part_3[b$Q21_Part_3 == ""] <- NA
b$Q21_Part_4[b$Q21_Part_4 == ""] <- NA
b$Q21_Part_5[b$Q21_Part_5 == ""] <- NA
b$Q21_Part_6[b$Q21_Part_6 == ""] <- NA
b$Q21_Part_7[b$Q21_Part_7 == ""] <- NA
b$Q21_Part_8[b$Q21_Part_8 == ""] <- NA
b$Q21_Part_9[b$Q21_Part_9 == ""] <- NA
b$Q21_Part_10[b$Q21_Part_10 == ""] <- NA
b$Q21_Part_11[b$Q21_Part_11 == ""] <- NA
b$Q21_Part_13[b$Q21_Part_13 == ""] <- NA
# Calculate count
c = 0
c <- b %>% rowwise() %>% mutate(Count=sum(!is.na(c_across(31:42))))
d <- c %>% count(Count)
# scatterplot
ggplot(c, aes(x=Count, y=Q9)) +
  geom_point(position = position_jitter(width = .07, height = .07)) +
  ggtitle("Salary vs Number of Visualization Tools used") + 
  xlab("Visualization Tools") +
  ylab("Salary") +
  stat_smooth(method = "lm", col = "red")

# linear regression to get the exact slope
lm(data=c,Q9~Count)

## 
## Call:
## lm(formula = Q9 ~ Count, data = c)
## 
## Coefficients:
## (Intercept)        Count  
##   137005.45       -80.63

Conclusion: Salary vs Number of Visualization Tools used

Visualization tools is a key component for relaying information to others. Is it important to know a wide array of visualization tools? Or is it more beneficial to master a select few tools? The data shows that more high paying jobs favor mastery in a few visualization tools, rather than the knowledge of many tools. Based on the data presented in the plot below, the number of visualization tools does not significantly affect the overall salary of a data scientist. For every unit increase in the Number of Visualization Tools a participant uses, their Salary is expected to decrease by -$80.63.

Salary vs Number of Data Acquisition Methods used

# Change empty string to NA
b$Q33_Part_1[b$Q33_Part_1 == ""] <- NA
b$Q33_Part_2[b$Q33_Part_2 == ""] <- NA
b$Q33_Part_3[b$Q33_Part_3 == ""] <- NA
b$Q33_Part_4[b$Q33_Part_4 == ""] <- NA
b$Q33_Part_5[b$Q33_Part_5 == ""] <- NA
b$Q33_Part_6[b$Q33_Part_6 == ""] <- NA
b$Q33_Part_7[b$Q33_Part_7 == ""] <- NA
b$Q33_Part_8[b$Q33_Part_8 == ""] <- NA
b$Q33_Part_9[b$Q33_Part_9 == ""] <- NA
b$Q33_Part_10[b$Q33_Part_10 == ""] <- NA
b$Q33_Part_11[b$Q33_Part_11 == ""] <- NA
# Calculate count
c = 0
c <- b %>% rowwise() %>% mutate(Count=sum(!is.na(c_across(43:53))))
# scatterplot
ggplot(c, aes(x=Count, y=Q9)) +
  geom_point(position = position_jitter(width = .07, height = .07)) +
  ggtitle("Salary vs Number of Data Acquistion Methods used") +
  xlab("Data Acquistion Methods by Count") +
  ylab("Salary") +
  stat_smooth(method = "lm", col = "red")

# linear regression to get the exact slope
lm(data=c,Q9~Count)

## 
## Call:
## lm(formula = Q9 ~ Count, data = c)
## 
## Coefficients:
## (Intercept)        Count  
##      145080        -2936

Conclusion: Salary vs Number of Data Acquisition Methods used

Found that there is a negative relationship between Data Acquisition Methods used and Salary. This is likely due to most high paying data science jobs using internal data provide by their company rather than relying on other method of data acquisition. For every unit increase in the Number of Data Acquisition Methods a participant uses, their Salary is expected to decrease by -$2,936.

Load world map data

survey_df<-filter(survey_raw,survey_raw$Q6=="Data Scientist")
survey_df <- survey_df %>% filter(!Q3 %in% c("I do not wish to disclose my location", "Other"))

 data(country.map)

#save out country names as as dataframe
temp <- as.data.frame(country.map$region) %>% distinct()
temp$flag <- 1

#take survey data and select age, country, experience and income
#convert strings to numbers for age, experience and income
#group by country and take the average
#use distinct() to give us one row per country
temp2<- survey_df %>% select(Q2, Q3, Q8, Q9) %>% mutate(region = tolower(Q3), income = gsub('.*-','', Q9), income = as.numeric(gsub(',','', income)), experience = as.numeric(gsub('.*-| +','', Q8)), age = as.numeric(gsub('.*-','', Q2))) %>% group_by(region) %>% mutate(income = round(mean(income, na.rm = TRUE)), experience = round(mean(experience, na.rm = TRUE)), age = round(mean(age, na.rm = TRUE))) %>% select(region, income, experience, age) %>% distinct() %>% ungroup()


#join our survey data to country list to see which countries need a name change
temp3 <- left_join(temp2, temp, by = c("region" = "country.map$region"))


# Select the columns connected to question 16 and shaping it from wide to long
# filter out NA values and create a count of languages for each respondant
# group by country and calculate the average number of languages a respondant knows in each country
temp4 <- survey_df %>% mutate(id = row_number(), region = tolower(Q3)) %>%  select(id, region, starts_with("Q16")) %>% melt(id.vars = c("id", "region"), measure.vars = c("Q16_Part_1","Q16_Part_2","Q16_Part_3","Q16_Part_4","Q16_Part_5","Q16_Part_6","Q16_Part_7","Q16_Part_8","Q16_Part_9","Q16_Part_10","Q16_Part_11","Q16_Part_12","Q16_Part_13","Q16_Part_14","Q16_Part_15","Q16_Part_16","Q16_Part_17","Q16_Part_18","Q16_OTHER_TEXT")) %>% filter(!is.na(value)) %>% select(id, region, value) %>% group_by(id, region) %>% mutate(num_lang = n()) %>% ungroup() %>% group_by(region) %>% mutate(avg_lang = round(mean(num_lang),1)) %>% select(region, avg_lang) %>% distinct() %>% ungroup()

#fix poor mapping and filter out singapore and hong kong which are too small to show on map
country_data <- temp2 %>% mutate(region = case_when(region == 'viet nam' ~ 'vietnam',
                                    region == 'iran, islamic republic of...' ~ 'iran',
                                    region == 'united kingdom of great britain and northern ireland' ~ 'united kingdom',
                                    region == 'republic of korea' ~ 'north korea',
                                    TRUE ~ region
                                    )) %>% filter(!region %in% c('singapore', 'hong kong (s.a.r.)'))

#fix poor mapping and filter out singapore and hong kong which are too small to show on map
lang_country_data <- temp4 %>% mutate(region = case_when(region == 'viet nam' ~ 'vietnam',
                                    region == 'iran, islamic republic of...' ~ 'iran',
                                    region == 'united kingdom of great britain and northern ireland' ~ 'united kingdom',
                                    region == 'republic of korea' ~ 'north korea',
                                    TRUE ~ region
                                    )) %>% filter(!region %in% c('singapore', 'hong kong (s.a.r.)'))

Data Scientist Age by Country

age_df <- country_data %>% rename(value = age) %>% select(region, value)

country_choropleth(age_df,
                 title      = "Data Scientist Age by Country",
                 legend     = "Average Age",
                 num_colors = 9) + scale_fill_brewer(palette = "BuGn")

Data Scientist Income by Country

income_df <- country_data %>% rename(value = income) %>% select(region, value)

country_choropleth(income_df,
                 title      = "Data Scientist Income by Country",
                 legend     = "Average Income",
                 num_colors = 9) + scale_fill_brewer(palette = "YlOrRd")

Data Scientist Years Experience by Country

experience_df <- country_data %>% rename(value = experience) %>% select(region, value)

country_choropleth(experience_df,
                 title      = "Data Scientist Years Experience by Country",
                 legend     = "Average Years Experience",
                 num_colors = 9) + scale_fill_brewer(palette = "Greys")

Programming Languages known by Country

lang_df <- lang_country_data %>% rename(value = avg_lang) %>% select(region, value)

country_choropleth(experience_df,
                 title      = "Data Scientist Number of Programming Languages by Country",
                 legend     = "Average Number of Languages",
                 num_colors = 9) + scale_fill_brewer(palette = "Blues")