AIM

This is a data analytics project for mining, analyzing, and visualizing the data collected by the Kaggle Data Science Survey conducted in 2017.

Let’s get started.

Importing the dataset in R

require(data.table)
require(highcharter)
require(ggplot2)
require(tidyverse)

# Load the multiple-choice survey responses; fread() is used for faster
# data reading.
survey_csv <- "../Datasets/kagglesurvey2017/multipleChoiceResponses.csv"
SurveyDf <- fread(survey_csv)
## 
Read 59.5% of 16817 rows
Read 16716 rows and 228 (of 228) columns from 0.023 GB file in 00:00:03
# Put SurveyDf's columns on the search path so that the code below can refer
# to them by bare name (e.g. Age, EmploymentStatus, CodeWriter).
# NOTE(review): attach() is generally discouraged because attached names can
# be masked silently; the later code depends on it, so it is kept here.
attach(SurveyDf)

Exploratory data analysis

Age and Employment status

# Summary statistics of Age, computed separately for each employment status.
by(data = Age, INDICES = EmploymentStatus, FUN = summary)
## EmploymentStatus: Employed full-time
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   27.00   31.00   33.48   38.00  100.00     195 
## -------------------------------------------------------- 
## EmploymentStatus: Employed part-time
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   15.00   22.00   25.00   27.51   30.00   72.00      16 
## -------------------------------------------------------- 
## EmploymentStatus: I prefer not to say
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   22.00   25.00   28.24   31.75  100.00      38 
## -------------------------------------------------------- 
## EmploymentStatus: Independent contractor, freelancer, or self-employed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   28.00   36.00   38.33   47.00  100.00      28 
## -------------------------------------------------------- 
## EmploymentStatus: Not employed, and not looking for work
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    9.00   20.00   22.00   24.46   25.00  100.00      15 
## -------------------------------------------------------- 
## EmploymentStatus: Not employed, but looking for work
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2.00   22.00   25.00   27.65   30.00   76.00      35 
## -------------------------------------------------------- 
## EmploymentStatus: Retired
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4.00   59.25   66.00   62.16   70.00   98.00       4
# Box plot of participant age, one box per employment status, with outliers
# hidden. The final hc_chart(type = "column") call makes the boxes vertical.
hcboxplot(
  x = Age,
  var = EmploymentStatus,
  name = "Age of participants",
  color = "purple",
  outliers = FALSE
) %>%
  hc_title(
    text = "Boxplot of Ages of the participants and their Employment Status",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary()) %>%
  hc_chart(type = "column")

The box plot above shows the bivariate relationship between age and employment status of the survey participants, along with the distribution of age within each status. We can see that young participants aged roughly 20-25 are mostly unemployed or looking for job opportunities.


Let’s see how many are students.

# Count the answers to the student-status question; most participants left
# this field blank.
table(SurveyDf[["StudentStatus"]])
## 
##          No   Yes 
## 15436   299   981
# Check whether participants are learning data science; most of them did not
# answer this question either.
table(SurveyDf[["LearningDataScience"]])
## 
##                                                                       
##                                                                 15432 
##                  No, I am not focused on learning data science skills 
##                                                                    55 
## Yes, but data science is a small part of what I'm focused on learning 
##                                                                   429 
##               Yes, I'm focused on learning mostly data science skills 
##                                                                   800
# Column chart of the answer counts for the LearningDataScience question.
hchart(
  LearningDataScience,
  type = "column",
  name = "count",
  color = "#99FF33"
) %>%
  hc_title(
    text = "Barplot of Learning Data science field",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

Both of these variables were left unanswered by more than 90% of the participants, so there is little scope for interpreting anything from them.


Let’s see how many are Coders

# Count how many participants say they write code.
table(CodeWriter, dnn = "CodeWriter")
## CodeWriter
##          No   Yes 
##  3530  3033 10153
# Most participants answered "Yes".

# Column chart of the CodeWriter answer counts.
hchart(
  CodeWriter,
  name = "Count",
  type = "column",
  color = "#9645FF"
) %>%
  hc_title(text = "Barplot of Number of Coders", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# Let's now check which country has the most coders.

# Count respondents for every (CodeWriter, Country) pair, keeping only the
# explicit "Yes"/"No" answers so blank responses are excluded. summarize()
# drops the last grouping variable, so the result stays grouped by CodeWriter.
codeCountry <- SurveyDf %>%
  filter(CodeWriter %in% c("Yes", "No")) %>%
  group_by(CodeWriter, Country) %>%
  summarize(total = n())

# Because codeCountry is still grouped by CodeWriter, this keeps the 10
# countries with the highest counts within each group (coders and
# non-coders separately). The weighting column is named explicitly rather
# than relying on top_n() defaulting to the last column.
TopCoders <- codeCountry %>%
  top_n(10, wt = total) %>%
  arrange(desc(total))

# Grouped column chart: one series per CodeWriter answer, one bar per country.
hchart(
  TopCoders,
  type = "column",
  name = c("Do not write code", "Code Writers"),
  hcaes(x = Country, y = total, group = CodeWriter),
  color = c("black", "#FF4040")
) %>%
  hc_title(
    text = "Barplot of Countries grouped by Coder Writers",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

Who has switched careers, and what are their current job titles?

# Among those who answered, more participants have switched careers than
# have not.
table(CareerSwitcher, dnn = "CareerSwitcher")
## CareerSwitcher
##          No   Yes 
## 13704   886  2126
# Count participants per current job title (the blank entry is unanswered).
table(CurrentJobTitleSelect, dnn = "CurrentJobTitleSelect")
## CurrentJobTitleSelect
##                                                          Business Analyst 
##                                 4886                                  796 
##                   Computer Scientist                         Data Analyst 
##                                  335                                 1213 
##                           Data Miner                       Data Scientist 
##                                  118                                 2433 
##                DBA/Database Engineer                             Engineer 
##                                  187                                  552 
##            Machine Learning Engineer     Operations Research Practitioner 
##                                  617                                   58 
##                                Other                   Predictive Modeler 
##                                 1233                                  181 
##                           Programmer                           Researcher 
##                                  462                                  619 
##                 Scientist/Researcher Software Developer/Software Engineer 
##                                  978                                 1759 
##                         Statistician 
##                                  289
# Build a data frame of job-title frequencies for the bar plot.
# as.data.frame() on a table stores the titles as a factor column.
jobdf <- as.data.frame(table(CurrentJobTitleSelect))

# Drop the blank (unanswered) job title explicitly. Writing a new label such
# as "Not answered" into the factor column is not valid (it yields NA with an
# "invalid factor level" warning), so the row is selected by its content
# instead of by relabelling row 1 and then calling na.omit().
answered <- nzchar(trimws(as.character(jobdf$CurrentJobTitleSelect)))
jobdf <- jobdf[answered, ]

# Column chart of job titles, most frequent first.
jobdf %>%
  arrange(desc(Freq)) %>%
  hchart(
    hcaes(x = CurrentJobTitleSelect, y = Freq),
    type = "column",
    name = "Count",
    color = "#751A75"
  ) %>%
  hc_title(
    text = "Barplot of Current Job titles of the participants",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# Country-wise trends in job titles.

# Count participants per (Country, CurrentJobTitleSelect), restricted to the
# countries listed in countryCount$Var1, ordered from largest to smallest.
# NOTE(review): countryCount is not created in the code shown before this
# point; it must already exist in the session -- confirm where it is defined.
countryJobs <- SurveyDf %>%
  filter(Country %in% countryCount$Var1) %>%
  group_by(Country, CurrentJobTitleSelect) %>%
  summarize(total = n()) %>%
  arrange(desc(total))

# Blank out the two largest rows and drop them.
# NOTE(review): presumably these are the unanswered job-title rows; this
# relies on row position after sorting -- verify against the data.
countryJobs[c(1, 2), ] <- NA
countryJobs <- na.omit(countryJobs)

# Per-country subsets for comparative plots of participants' job titles.
USIndJobs <- countryJobs %>%
  filter(Country %in% c("United States", "India"))

# Colour vectors for plotting. `colors` holds 16 hex colours; `colors2` is
# the same palette plus a 17th colour. The extra entry is written with its
# leading "#" so that it is a valid hex colour string.
colors <- c(
  "#d35400", "#2980b9", "#2ecc71", "#f1c40f", "#2c3e50", "#7f8c8d",
  "#000004", "#3B0F70", "#8C2981", "#DE4968", "#FE9F6D", "#FCFDBF",
  "#ffb3b3", "#66ff33", "#00b3b3", "#4d4dff"
)
colors2 <- c(colors, "#7D8A16")

# Grouped column chart: one bar group per country, one series per job title.
hchart(
  USIndJobs,
  type = "column",
  hcaes(x = Country, y = total, group = CurrentJobTitleSelect),
  color = colors
) %>%
  hc_title(
    text = "Barplot of Jobs of Country's participants",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# USA, India and Russia: noted by the author as the top 3 countries by
# number of responses.
JobsMajorCountry <- filter(
  countryJobs,
  Country %in% c("United States", "India", "Russia")
)

# Mark the job title in row 13 as missing so that na.omit() drops that row
# from the plot below.
# NOTE(review): presumably row 13 is an unanswered job title; this depends on
# row order -- verify against the data.
JobsMajorCountry[13, 2] <- NA

# Grouped column chart for the three countries, excluding the NA row.
hchart(
  na.omit(JobsMajorCountry),
  type = "column",
  hcaes(x = Country, y = total, group = CurrentJobTitleSelect),
  color = colors
) %>%
  hc_title(
    text = "Barplot of Jobs of Country's participants",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

The plots above compare the current job titles of survey participants from India and the USA (the second plot also includes Russia).

1)From India Most of the participants were Software developers followed by Data scientists.

2)From USA most of the participants were Data scientists followed by software developers.


Let’s check where the participants were employed.

I will make a data frame with only the top 15 employer types.

# Find the top 15 employer types by frequency, most common first.
# top_n() is left to choose its weighting column itself (it reports
# "Selecting by Freq").
Employer <- table(CurrentEmployerType) %>%
  as.data.frame() %>%
  top_n(15) %>%
  arrange(desc(Freq))
## Selecting by Freq
# Give the columns descriptive names, then blank out the first (largest) row
# so that na.omit() removes it from the plot.
# NOTE(review): presumably row 1 is the unanswered category -- confirm.
colnames(Employer) <- c("EmployerType", "Count")
Employer[1, ] <- NA

# Column chart of the remaining employer types.
hchart(
  na.omit(Employer),
  type = "column",
  hcaes(x = EmployerType, y = Count),
  color = "#0E2E93"
) %>%
  hc_title(text = "Barplot of top 15 Type of Employers", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())
# Top 20 (employer type, country) combinations by frequency, largest first.
# top_n() is left to choose its weighting column itself (it reports
# "Selecting by Freq").
EmployerType <- table(CurrentEmployerType, Country) %>%
  as.data.frame() %>%
  top_n(20) %>%
  arrange(desc(Freq))
## Selecting by Freq
# Rename the columns, then set the employer type to NA in the rows treated
# as missing so that na.omit() drops them before plotting.
# NOTE(review): the row indices are hard-coded and depend on the sort order
# of the data -- verify against the data.
names(EmployerType) <- c("EmployerType", "Country", "Count")
EmployerType[c(1, 2, 8, 12, 13, 16, 19, 20), 1] <- NA

# Grouped column chart: employer type on the x-axis, one series per country.
hchart(
  na.omit(EmployerType),
  type = "column",
  hcaes(x = EmployerType, y = Count, group = Country),
  color = c("#DE4968", "#f1c40f", "black")
) %>%
  hc_title(
    text = "Barplot of Type of Employer and Country",
    align = "center"
  ) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())