This is Part 3 of this project, a data analytics project for mining, analyzing, and visualizing the data collected by the Kaggle Data Science Survey conducted in 2017.
Part 3 - This section analyzes the professional lives of the participants: their major degree, the time they spend studying data science topics, the job titles they hold, which ML methods they actually use in industry, which blogs they prefer most for studying data science, etc.
# Tally the answers to BlogsPodcastsNewslettersSelect and keep the 15 most
# frequent groups, ordered from most to least common. The ranking column is
# passed to top_n() explicitly instead of relying on it being the last column.
# The unanswered group is then removed by value rather than by overwriting
# row 1, so a real answer is never hidden when the blank group is not the most
# frequent one.
# NOTE(review): presumably unanswered questions are stored as "" or NA --
# confirm against how SurveyDf is loaded.
blogs <- SurveyDf %>%
  group_by(BlogsPodcastsNewslettersSelect) %>%
  summarise(count = n()) %>%
  top_n(15, count) %>%
  arrange(desc(count)) %>%
  filter(
    !is.na(BlogsPodcastsNewslettersSelect),
    BlogsPodcastsNewslettersSelect != ""
  )
colnames(blogs) <- c("Blogname", "Count")

# Column chart of the remaining blogs; na.omit() is kept as a final guard
# against incomplete rows.
hchart(
  na.omit(blogs),
  hcaes(x = Blogname, y = Count),
  type = "column",
  color = "#062D67"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(
    text = "Barplot of most preferred blogs for learning",
    align = "center"
  ) %>%
  hc_add_theme(hc_theme_elementary())
Hence, one can see that the most popular and preferred blog sites are R-bloggers and KDnuggets.
# Frequency table of how long participants have been learning data science.
# The column is read from SurveyDf directly so the call does not depend on the
# data frame having been attach()ed; dnn keeps the variable name in the
# printed header.
table(SurveyDf$LearningDataScienceTime, dnn = "LearningDataScienceTime")
## LearningDataScienceTime
## < 1 year 1-2 years 10-15 years 15+ years 3-5 years
## 12367 2093 1566 14 30 540
## 5-10 years
## 106
# Pie chart of the LearningDataScienceTime answers; the series is labelled
# "count".
hchart(
  SurveyDf$LearningDataScienceTime,
  type = "pie",
  name = "count"
)
So most of the participants started learning data science within the past year; that is, they have been studying it for less than a year.
Let's check the age distribution of the participants against how long they have been learning data science.
# Box plot of participant age, one box per LearningDataScienceTime answer.
# Outliers are not drawn. FALSE is spelled out because F is an ordinary
# variable that can be reassigned.
hcboxplot(
  x = SurveyDf$Age,
  var = SurveyDf$LearningDataScienceTime,
  outliers = FALSE,
  color = "#09870D",
  name = "Age Distribution"
) %>%
  hc_chart(type = "column") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(
    text = "Boxplot of ages and the learning time of participants",
    align = "center"
  ) %>%
  hc_add_theme(hc_theme_elementary())
The above plot was quite predictable, as people who have spent less time learning data science are younger.
Let's do some data wrangling and transformations.
We make a separate data frame for each variable, for easier understanding.
#' Count the rows of a data frame per value of one column
#'
#' @param df A data frame.
#' @param var Bare (unquoted) name of the column in `df` to group by.
#' @return A data frame with one row per distinct value of `var`: the grouping
#'   column followed by `count`, the number of rows in that group, sorted by
#'   `count` in descending order.
aggr <- function(df, var) {
  stopifnot(is.data.frame(df))
  # Capture the bare column name so it can be unquoted inside group_by().
  var <- dplyr::enquo(var)
  # dplyr functions are namespace-qualified so the function neither attaches a
  # package as a side effect nor fails silently when dplyr is missing.
  counts <- dplyr::summarise(dplyr::group_by(df, !!var), count = dplyr::n())
  dplyr::arrange(counts, dplyr::desc(count))
}
# Remove the "no answer" group from a count table returned by aggr().
# The answer values are taken from the first column. Rows whose answer is NA
# or the empty string are dropped by value, instead of blanking row 1 by
# position, so a real answer is never hidden when the unanswered group is not
# the most frequent one.
# NOTE(review): presumably unanswered questions are stored as "" or NA --
# confirm against how SurveyDf is loaded.
drop_unanswered <- function(counts) {
  answer <- as.character(counts[[1]])
  counts[!is.na(answer) & nzchar(answer), ]
}

# One count table per JobSkillImportance* question.
RSkill <- drop_unanswered(aggr(SurveyDf, JobSkillImportanceR))
SqlSkill <- drop_unanswered(aggr(SurveyDf, JobSkillImportanceSQL))
PythonSkill <- drop_unanswered(aggr(SurveyDf, JobSkillImportancePython))
BigDataSkill <- drop_unanswered(aggr(SurveyDf, JobSkillImportanceBigData))
StatsSkill <- drop_unanswered(aggr(SurveyDf, JobSkillImportanceStats))
DegreeSkill <- drop_unanswered(aggr(SurveyDf, JobSkillImportanceDegree))
EnterToolsSkill <- drop_unanswered(
  aggr(SurveyDf, JobSkillImportanceEnterpriseTools)
)
MOOCSkill <- drop_unanswered(aggr(SurveyDf, JobSkillImportanceMOOC))
DataVisSkill <- drop_unanswered(
  aggr(SurveyDf, JobSkillImportanceVisualizations)
)
KaggleRankSkill <- drop_unanswered(
  aggr(SurveyDf, JobSkillImportanceKaggleRanking)
)
# Add the export menu, a centred title and the elementary theme that every
# skill pie chart below shares.
finish_skill_pie <- function(chart, title) {
  chart %>%
    hc_exporting(enabled = TRUE) %>%
    hc_title(text = title, align = "center") %>%
    hc_add_theme(hc_theme_elementary())
}

# One pie chart per skill count table; na.omit() drops incomplete rows before
# plotting.
hchart(
  na.omit(RSkill),
  hcaes(x = JobSkillImportanceR, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of R skill")

hchart(
  na.omit(PythonSkill),
  hcaes(x = JobSkillImportancePython, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of Python skill")

hchart(
  na.omit(SqlSkill),
  hcaes(x = JobSkillImportanceSQL, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of SQL skill")

hchart(
  na.omit(BigDataSkill),
  hcaes(x = JobSkillImportanceBigData, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of Big Data skill")

# Title corrected from "Statistics kill".
hchart(
  na.omit(StatsSkill),
  hcaes(x = JobSkillImportanceStats, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of Statistics skill")

hchart(
  na.omit(DataVisSkill),
  hcaes(x = JobSkillImportanceVisualizations, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of Data Viz skill")

hchart(
  na.omit(DegreeSkill),
  hcaes(x = JobSkillImportanceDegree, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of Degree")

hchart(
  na.omit(EnterToolsSkill),
  hcaes(x = JobSkillImportanceEnterpriseTools, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of Enterprise Tools skill")

hchart(
  na.omit(MOOCSkill),
  hcaes(x = JobSkillImportanceMOOC, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of MOOCs")

hchart(
  na.omit(KaggleRankSkill),
  hcaes(x = JobSkillImportanceKaggleRanking, y = count),
  type = "pie", name = "Count"
) %>%
  finish_skill_pie("Piechart of importance of Kaggle Rankings")
We can see from the above plots that the skills most often rated unnecessary are knowledge of enterprise tools, a degree, Kaggle rankings, and MOOCs. These have the highest counts of "Unnecessary" answers from the participants.
Knowledge of statistics, Python, R, and big data, on the other hand, is most often rated "Necessary" or "Nice to have" by the survey participants.
# Tally the answers to ProveKnowledgeSelect, most frequent first.
# The unanswered group is removed by value rather than by overwriting row 1,
# so a real answer is never hidden when the blank group is not the most
# frequent one.
# NOTE(review): presumably unanswered questions are stored as "" or NA --
# confirm against how SurveyDf is loaded.
knowlegdeDf <- SurveyDf %>%
  group_by(ProveKnowledgeSelect) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  filter(!is.na(ProveKnowledgeSelect), ProveKnowledgeSelect != "")

# Column chart of the remaining answers; na.omit() is kept as a final guard
# against incomplete rows.
hchart(
  na.omit(knowlegdeDf),
  hcaes(x = ProveKnowledgeSelect, y = count),
  type = "column",
  color = "#049382",
  name = "count"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(
    text = "Barplot of what proves you have Datascience knowledge",
    align = "center"
  ) %>%
  hc_add_theme(hc_theme_elementary())
Let's now check the formal education of the participants:
# Frequency table of the participants' formal education.
# The column is read from SurveyDf directly so the call does not depend on the
# data frame having been attach()ed; dnn keeps the variable name in the
# printed header.
table(SurveyDf$FormalEducation, dnn = "FormalEducation")
## FormalEducation
##
## 1701
## Bachelor's degree
## 4811
## Doctoral degree
## 2347
## I did not complete any formal education past high school
## 257
## I prefer not to answer
## 90
## Master's degree
## 6273
## Professional degree
## 451
## Some college/university study without earning a bachelor's degree
## 786
Let's check the most popular machine learning techniques in which participants consider themselves competent.
# Tally the answers to MLTechniquesSelect, sort them by frequency and keep the
# 20 most frequent groups. The ranking column is passed to top_n() explicitly
# instead of relying on it being the last column (this also silences the
# "Selecting by count" message).
# The unanswered group is then removed by value rather than by overwriting
# row 1, so a real answer is never hidden when the blank group is not the most
# frequent one.
# NOTE(review): presumably unanswered questions are stored as "" or NA --
# confirm against how SurveyDf is loaded.
Mltechique <- SurveyDf %>%
  group_by(MLTechniquesSelect) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(20, count) %>%
  filter(!is.na(MLTechniquesSelect), MLTechniquesSelect != "")

# Column chart of the remaining answers; na.omit() is kept as a final guard
# against incomplete rows.
hchart(
  na.omit(Mltechique),
  hcaes(x = MLTechniquesSelect, y = count),
  type = "column",
  color = "purple",
  name = "count"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(
    text = "Barplot of competent ML techniques of participants",
    align = "center"
  ) %>%
  hc_add_theme(hc_theme_elementary())
So we can see that logistic regression, decision trees, and random forests are the three techniques in which participants most often consider themselves competent, i.e. the ones they can implement successfully and most efficiently.
Now we will check which machine learning algorithms are most used by the participants at work.
# Tally the answers to WorkAlgorithmsSelect, sort them by frequency and keep
# the 20 most frequent groups. The ranking column is passed to top_n()
# explicitly instead of relying on it being the last column (this also
# silences the "Selecting by count" message).
MLalgoWork <- SurveyDf %>%
  group_by(WorkAlgorithmsSelect) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(20, count)
# Rows 1 and 3 are blanked by position so that na.omit() drops them before
# plotting.
# NOTE(review): presumably these two rows are unanswered groups (e.g. "" and
# NA) -- confirm, and prefer filtering them by value so that a real answer is
# never hidden if the ordering changes.
MLalgoWork[c(1, 3), ] <- NA

# Column chart of the remaining answers.
hchart(
  na.omit(MLalgoWork),
  hcaes(x = WorkAlgorithmsSelect, y = count),
  type = "column",
  color = "green",
  name = "count"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(
    text = "Barplot of Most used ML algorithms at Work",
    align = "center"
  ) %>%
  hc_add_theme(hc_theme_elementary())
Again, as we can see from the above plot, regression, logistic regression, and decision trees lead the pack as the learning algorithms most used at work by the participants.
The next field answers the question: for work, which data science/analytics tools, technologies, and languages have the participants used in the past year?
We are going to find the top 20 tools.
# Tally the answers to WorkToolsSelect, sort them by frequency and keep the 20
# most frequent groups. The ranking column is passed to top_n() explicitly
# instead of relying on it being the last column (this also silences the
# "Selecting by count" message).
# The unanswered group is then removed by value rather than by overwriting
# row 1, so a real answer is never hidden when the blank group is not the most
# frequent one.
# NOTE(review): presumably unanswered questions are stored as "" or NA --
# confirm against how SurveyDf is loaded.
ToolatWork <- SurveyDf %>%
  group_by(WorkToolsSelect) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(20, count) %>%
  filter(!is.na(WorkToolsSelect), WorkToolsSelect != "")

# Column chart of the remaining answers; na.omit() is kept as a final guard
# against incomplete rows.
hchart(
  na.omit(ToolatWork),
  hcaes(x = WorkToolsSelect, y = count),
  type = "column",
  color = "#7C0E3E",
  name = "count"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(
    text = "Barplot of Most used data science tools used at Work",
    align = "center"
  ) %>%
  hc_add_theme(hc_theme_elementary())
From the above plot we can see that, according to the survey participants, Python and R together are the tools data scientists use most. Hence Python and R still top the list of most used tools at work.
# Tally the answers to WorkMethodsSelect, sort them by frequency and keep the
# 20 most frequent groups. The ranking column is passed to top_n() explicitly
# instead of relying on it being the last column (this also silences the
# "Selecting by count" message).
MethodatWork <- SurveyDf %>%
  group_by(WorkMethodsSelect) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(20, count)
# Rows 1 and 3 are blanked by position so that na.omit() drops them before
# plotting.
# NOTE(review): presumably these two rows are unanswered groups (e.g. "" and
# NA) -- confirm, and prefer filtering them by value so that a real answer is
# never hidden if the ordering changes.
MethodatWork[c(1, 3), ] <- NA

# Column chart of the remaining answers.
hchart(
  na.omit(MethodatWork),
  hcaes(x = WorkMethodsSelect, y = count),
  type = "column",
  color = "#F14B5B",
  name = "count"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(
    text = "Barplot of Most used ML and DS methods used at Work",
    align = "center"
  ) %>%
  hc_add_theme(hc_theme_elementary())