This is a data analytics project for mining, analyzing, and visualizing the data collected by the Kaggle Data Science Survey conducted in 2017.
# Load dependencies with library() so that a missing package stops the script
# immediately; require() only warns and returns FALSE, which lets the script
# continue and fail later with an unrelated "could not find function" error.
library(data.table)
library(highcharter)
library(ggplot2)
library(tidyverse)
# fread() is used for faster data reading.
SurveyDf <- fread("../Datasets/kagglesurvey2017/multipleChoiceResponses.csv")
##
Read 59.5% of 16817 rows
Read 16716 rows and 228 (of 228) columns from 0.023 GB file in 00:00:03
# Put the survey columns on the search path; the statements that follow refer
# to columns such as Age and EmploymentStatus by their bare names.
attach(SurveyDf)
# Summary statistics of Age within each employment status.
by(data = Age, INDICES = EmploymentStatus, FUN = summary)
## EmploymentStatus: Employed full-time
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 27.00 31.00 33.48 38.00 100.00 195
## --------------------------------------------------------
## EmploymentStatus: Employed part-time
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.00 22.00 25.00 27.51 30.00 72.00 16
## --------------------------------------------------------
## EmploymentStatus: I prefer not to say
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 22.00 25.00 28.24 31.75 100.00 38
## --------------------------------------------------------
## EmploymentStatus: Independent contractor, freelancer, or self-employed
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 28.00 36.00 38.33 47.00 100.00 28
## --------------------------------------------------------
## EmploymentStatus: Not employed, and not looking for work
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 9.00 20.00 22.00 24.46 25.00 100.00 15
## --------------------------------------------------------
## EmploymentStatus: Not employed, but looking for work
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.00 22.00 25.00 27.65 30.00 76.00 35
## --------------------------------------------------------
## EmploymentStatus: Retired
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.00 59.25 66.00 62.16 70.00 98.00 4
# Box plot of participant age for each employment status (outliers hidden).
hcboxplot(
  x = Age,
  var = EmploymentStatus,
  name = "Age of participants",
  color = "purple",
  outliers = FALSE
) %>%
  hc_title(
    text = "Boxplot of Ages of the participants and their Employment Status",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary()) %>%
  hc_chart(type = "column") # for vertical box plot
The boxplot above shows the bivariate relationship between age and employment status, along with the statistical distribution of age within each status group. We can notice that the young participants between the ages of 20 and 25 are mostly unemployed or looking for job opportunities.
# Frequency of each StudentStatus answer; most respondents left it blank.
table(SurveyDf[["StudentStatus"]])
##
## No Yes
## 15436 299 981
# Check whether participants are learning data science or not;
# most respondents did not answer this question either.
table(SurveyDf[["LearningDataScience"]])
##
##
## 15432
## No, I am not focused on learning data science skills
## 55
## Yes, but data science is a small part of what I'm focused on learning
## 429
## Yes, I'm focused on learning mostly data science skills
## 800
# Column chart of the LearningDataScience answers.
hchart(
  LearningDataScience,
  type = "column",
  name = "count",
  color = "#99FF33"
) %>%
  hc_title(
    text = "Barplot of Learning Data science field",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
Hence we can notice that more than 90% of the participants did not answer either of these two questions, so there is little scope for interpreting anything from them.
# Frequency of each CodeWriter answer (the blank entry is "not answered").
table(CodeWriter, dnn = "CodeWriter")
## CodeWriter
## No Yes
## 3530 3033 10153
# Most of the participants answered "Yes".
# Column chart of the CodeWriter answers.
hchart(
  CodeWriter,
  name = "Count",
  type = "column",
  color = "#9645FF"
) %>%
  hc_title(text = "Barplot of Number of Coders", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# Which country has the most coders?
# Count respondents per (CodeWriter, Country) pair, keeping only the rows
# where CodeWriter was answered "Yes" or "No".
codeCountry <- SurveyDf %>%
  group_by(CodeWriter, Country) %>%
  filter(CodeWriter %in% c("Yes", "No")) %>%
  select(CodeWriter, Country) %>%
  summarize(total = n())

# Top 10 countries and their totals of coders and non-coders,
# sorted from largest to smallest.
TopCoders <- codeCountry %>%
  top_n(10) %>%
  arrange(desc(total))

# Grouped column chart: one series per CodeWriter answer.
hchart(
  TopCoders,
  type = "column",
  name = c("Do not write code", "Code Writers"),
  hcaes(x = Country, y = total, group = CodeWriter),
  color = c("black", "#FF4040")
) %>%
  hc_title(
    text = "Barplot of Countries grouped by Coder Writers",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# More participants have changed their jobs than have not.
table(CareerSwitcher, dnn = "CareerSwitcher")
## CareerSwitcher
## No Yes
## 13704 886 2126
# Frequency of each current job title.
table(CurrentJobTitleSelect, dnn = "CurrentJobTitleSelect")
## CurrentJobTitleSelect
## Business Analyst
## 4886 796
## Computer Scientist Data Analyst
## 335 1213
## Data Miner Data Scientist
## 118 2433
## DBA/Database Engineer Engineer
## 187 552
## Machine Learning Engineer Operations Research Practitioner
## 617 58
## Other Predictive Modeler
## 1233 181
## Programmer Researcher
## 462 619
## Scientist/Researcher Software Developer/Software Engineer
## 978 1759
## Statistician
## 289
# Build a data frame of job-title frequencies for the bar plot.
# as.data.frame() on a table yields a factor column, so writing a new string
# such as "Not answered" into it only produces NA plus an "invalid factor
# level" warning. The unanswered (blank) category is therefore dropped
# explicitly, by value rather than by row position.
jobdf <- as.data.frame(table(CurrentJobTitleSelect))
answered <- nzchar(trimws(as.character(jobdf$CurrentJobTitleSelect)))
jobdf <- jobdf[answered, , drop = FALSE]
# Column chart of job titles, ordered from most to least frequent.
jobdf %>%
  arrange(desc(Freq)) %>%
  hchart(
    hcaes(x = CurrentJobTitleSelect, y = Freq),
    type = "column",
    name = "Count",
    color = "#751A75"
  ) %>%
  hc_title(
    text = "Barplot of Current Job titles of the participants",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# Country-wise job-title trends: number of respondents per
# (Country, CurrentJobTitleSelect) pair, largest first.
# NOTE(review): countryCount is not defined in this part of the file; its
# Var1 column is presumably the list of countries to keep. Confirm where
# it is created.
countryJobs <- SurveyDf %>%
  group_by(Country, CurrentJobTitleSelect) %>%
  select(Country, CurrentJobTitleSelect) %>%
  filter(Country %in% countryCount$Var1) %>%
  summarize(total = n()) %>%
  arrange(desc(total))
# Overwrite the two largest rows with NA so that na.omit() removes them.
# NOTE(review): presumably these are the unanswered job-title rows; verify
# against the data, since the selection is positional.
countryJobs[1:2, ] <- NA
countryJobs <- na.omit(countryJobs)
#making separate countries for comparative plots of Jobs of that country's participants.
#Colors vectors for plotting
# Job-title counts restricted to the United States and India.
USIndJobs <- filter(countryJobs, Country %in% c("United States", "India"))
# Colour palette (16 hex colours) used for the grouped job-title charts.
colors <- c(
  "#d35400", "#2980b9", "#2ecc71", "#f1c40f", "#2c3e50", "#7f8c8d",
  "#000004", "#3B0F70", "#8C2981", "#DE4968", "#FE9F6D", "#FCFDBF",
  "#ffb3b3", "#66ff33", "#00b3b3", "#4d4dff"
)
# Same palette plus one extra colour (17 in total). The extra entry needs the
# leading "#" to be a valid hex colour string.
colors2 <- c(colors, "#7D8A16")
# Grouped column chart: one series per job title, one cluster per country.
hchart(
  USIndJobs,
  type = "column",
  hcaes(x = Country, y = total, group = CurrentJobTitleSelect),
  color = colors
) %>%
  hc_title(
    text = "Barplot of Jobs of Country's participants",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# USA, India and Russia: the top 3 countries with the most responses.
JobsMajorCountry <- filter(
  countryJobs,
  Country %in% c("United States", "India", "Russia")
)
# Mark the job title in row 13 as NA so that na.omit() below drops that row.
# NOTE(review): presumably this is an unanswered job-title row; verify
# against the data, since the selection is positional.
JobsMajorCountry[13, 2] <- NA
# Grouped column chart: one series per job title, one cluster per country.
hchart(
  na.omit(JobsMajorCountry),
  type = "column",
  hcaes(x = Country, y = total, group = CurrentJobTitleSelect),
  color = colors
) %>%
  hc_title(
    text = "Barplot of Jobs of Country's participants",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
The above plot compares the current job titles of survey participants from India and USA.
1) From India, most of the participants were software developers, followed by data scientists.
2) From USA, most of the participants were data scientists, followed by software developers.
I will make a dataframe with only the top 15 employer types.
# Find the top 15 employer types, most frequent first.
Employer <- table(CurrentEmployerType) %>%
  as.data.frame() %>%
  top_n(15) %>%
  arrange(desc(Freq))
## Selecting by Freq
# Rename the columns, then blank out the most frequent row so that na.omit()
# drops it from the chart.
# NOTE(review): presumably row 1 is the unanswered category; verify.
names(Employer) <- c("EmployerType", "Count")
Employer[1, ] <- NA
hchart(
  na.omit(Employer),
  type = "column",
  hcaes(x = EmployerType, y = Count),
  color = "#0E2E93"
) %>%
  hc_title(
    text = "Barplot of top 15 Type of Employers",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# Top 20 (employer type, country) combinations, most frequent first.
EmployerType <- table(CurrentEmployerType, Country) %>%
  as.data.frame() %>%
  top_n(20) %>%
  arrange(desc(Freq))
## Selecting by Freq
# Rename the columns, then set the employer type to NA in the rows treated
# as missing so that na.omit() drops them from the chart.
# NOTE(review): the row positions are hard-coded; verify them against the
# data if the input changes.
colnames(EmployerType) <- c("EmployerType", "Country", "Count")
EmployerType[c(1, 2, 8, 12, 13, 16, 19, 20), 1] <- NA
# Plot the data: one series per country, one cluster per employer type.
hchart(
  na.omit(EmployerType),
  type = "column",
  hcaes(x = EmployerType, y = Count, group = Country),
  color = c("#DE4968", "#f1c40f", "black")
) %>%
  hc_title(
    text = "Barplot of Type of Employer and Country",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())