Analyzing Kaggle Data Science Survey Data-2017

AIM

This is a data analytics project for mining, analyzing, and visualizing the data collected by the Kaggle Data Science Survey conducted in 2017.

Let’s get started.

Importing the dataset in R

# Load the packages with library() so that a missing package stops the
# script right here instead of returning FALSE and failing further down.
library(data.table)
library(highcharter)
library(ggplot2)
library(tidyverse)

# fread() is used for faster data reading of the multiple-choice responses.
SurveyDf <- fread("../Datasets/kagglesurvey2017/multipleChoiceResponses.csv")

## 
Read 59.5% of 16817 rows
Read 16716 rows and 228 (of 228) columns from 0.023 GB file in 00:00:03

# attach() puts the columns of SurveyDf on the search path so they can be
# referenced by bare name (e.g. Age instead of SurveyDf$Age).
attach(SurveyDf)

Exploratory data analysis

1) Part 1 - Analyzing country, ethnicity, gender, age, employment status, learning data science and other related basic features of the survey participants.

Checking the distribution of each variable-

# SurveyDf is already attached above; attaching it a second time would only
# add a duplicate search-path entry and masking messages, so it is not
# repeated here.

# Frequency of each GenderSelect response (the blank label counts empty answers).
table(SurveyDf$GenderSelect)

## 
##                                                   
##                                                95 
##                              A different identity 
##                                               159 
##                                            Female 
##                                              2778 
##                                              Male 
##                                             13610 
## Non-binary, genderqueer, or gender non-conforming 
##                                                74

# Number of participants per Country value
table(SurveyDf[["Country"]])

## 
##                                               Argentina 
##                         121                          92 
##                   Australia                     Belarus 
##                         421                          54 
##                     Belgium                      Brazil 
##                          91                         465 
##                      Canada                       Chile 
##                         440                          51 
##                    Colombia              Czech Republic 
##                         113                          53 
##                     Denmark                       Egypt 
##                          78                          66 
##                     Finland                      France 
##                          67                         442 
##                     Germany                      Greece 
##                         460                          81 
##                   Hong Kong                     Hungary 
##                          65                          66 
##                       India                   Indonesia 
##                        2704                         131 
##                        Iran                     Ireland 
##                         112                          94 
##                      Israel                       Italy 
##                         105                         238 
##                       Japan                       Kenya 
##                         277                          59 
##                    Malaysia                      Mexico 
##                          79                         126 
##                 Netherlands                 New Zealand 
##                         205                          74 
##                     Nigeria                      Norway 
##                          73                          53 
##                       Other                    Pakistan 
##                        1023                         161 
## People 's Republic of China                 Philippines 
##                         471                          84 
##                      Poland                    Portugal 
##                         184                          93 
##           Republic of China                     Romania 
##                          67                          59 
##                      Russia                   Singapore 
##                         578                         184 
##                South Africa                 South Korea 
##                         127                         194 
##                       Spain                      Sweden 
##                         320                          89 
##                 Switzerland                      Taiwan 
##                         129                         254 
##                      Turkey                     Ukraine 
##                         144                         196 
##              United Kingdom               United States 
##                         535                        4197 
##                     Vietnam 
##                          71

# Summary statistics of the non-missing ages
SurveyDf$Age %>%
  na.omit() %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   25.00   30.00   32.37   37.00  100.00

# Number of participants per employment status
table(SurveyDf[["EmploymentStatus"]])

## 
##                                   Employed full-time 
##                                                10897 
##                                   Employed part-time 
##                                                  917 
##                                  I prefer not to say 
##                                                  420 
## Independent contractor, freelancer, or self-employed 
##                                                 1330 
##               Not employed, and not looking for work 
##                                                  924 
##                   Not employed, but looking for work 
##                                                 2110 
##                                              Retired 
##                                                  118

Let’s visualize the above variables to understand their distributions better. I will use the highcharter and ggplot2 packages.

# Horizontal bar chart of the gender responses
SurveyDf$GenderSelect %>%
  hchart(type = "bar", name = "count", color = "green") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot of gender", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# Horizontal bar chart of the employment-status responses
SurveyDf$EmploymentStatus %>%
  hchart(type = "bar", name = "count", color = "red") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot of Employment Status", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# Horizontal bar chart of the participants' countries.
# The chart title previously read "Country or participants" (typo for "of").
hchart(SurveyDf$Country, type = "bar", name = "Count", color = "blue") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot of Country of participants", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# Treemap of the 10 most frequent Country values among the participants.
# The ranking column is passed to top_n() explicitly instead of relying on
# its default of silently picking the last column.
countryCount <- as.data.frame(table(SurveyDf$Country)) %>%
  top_n(10, Freq)

hchart(
  countryCount,
  hcaes(Var1, value = Freq, color = Freq),
  name = "Count of participants",
  type = "treemap"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Tree map of top 10 countries of participants", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

So most of the participants were male and were employed full-time. Secondly, most of the participants are from the USA, followed by India in second place. This shows that people from these countries are very much interested in data science and its related subfields, and it also suggests that a large number of people there work in the field of data science.

Let’s check the distribution of the ages of participants–

# Summary statistics of the non-missing ages, ahead of the histogram
SurveyDf$Age %>%
  na.omit() %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   25.00   30.00   32.37   37.00  100.00

# Mean is above the median, i.e. the distribution is positively skewed.

# Histogram of the non-missing ages
SurveyDf$Age %>%
  na.omit() %>%
  hchart(name = "count", color = "orange") %>%
  hc_title(text = "Histogram of Ages of the participants", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

The distribution is somewhat positively skewed, as we can see from the slightly long right tail of the histogram, which indicates that mean(age) > median(age) > mode(age). From the above histogram, the highest count is of participants from age 20 to 25. This indicates that many students, young job seekers, freshers, etc. have taken this survey and are interested in data science and its related domains.

We can also check this relationship between Age and employment status of the participants using a Boxplot.

Age and Employment status

# Age summary within each employment status. with() looks the columns up in
# SurveyDf directly, so this no longer depends on SurveyDf being attached,
# while the printed group label stays "EmploymentStatus".
with(SurveyDf, by(Age, EmploymentStatus, summary))

## EmploymentStatus: Employed full-time
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   27.00   31.00   33.48   38.00  100.00     195 
## -------------------------------------------------------- 
## EmploymentStatus: Employed part-time
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   15.00   22.00   25.00   27.51   30.00   72.00      16 
## -------------------------------------------------------- 
## EmploymentStatus: I prefer not to say
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   22.00   25.00   28.24   31.75  100.00      38 
## -------------------------------------------------------- 
## EmploymentStatus: Independent contractor, freelancer, or self-employed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   28.00   36.00   38.33   47.00  100.00      28 
## -------------------------------------------------------- 
## EmploymentStatus: Not employed, and not looking for work
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    9.00   20.00   22.00   24.46   25.00  100.00      15 
## -------------------------------------------------------- 
## EmploymentStatus: Not employed, but looking for work
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2.00   22.00   25.00   27.65   30.00   76.00      35 
## -------------------------------------------------------- 
## EmploymentStatus: Retired
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4.00   59.25   66.00   62.16   70.00   98.00       4

# Boxplot of age per employment status. The columns are taken from SurveyDf
# explicitly so the chunk does not depend on SurveyDf being attached.
hcboxplot(
  x = SurveyDf$Age,
  name = "Age of participants",
  var = SurveyDf$EmploymentStatus,
  color = "purple",
  outliers = FALSE
) %>%
  hc_title(text = "Boxplot of Ages of the participants and their Employment Status", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary()) %>%
  hc_chart(type = "column") # for vertical box plot

Hence the above boxplot shows the bivariate relation between age and employment status of the survey participants, along with the statistical distribution of age within each status. We can notice that the young participants between the ages of 20 and 25 are mostly unemployed or looking for job opportunities.

Let’s see how many are students ?

# Student status counts; most participants left this field blank
table(SurveyDf[["StudentStatus"]])

## 
##          No   Yes 
## 15436   299   981

# Are the participants learning data science? Most of them left this
# question blank as well.
table(SurveyDf[["LearningDataScience"]])

## 
##                                                                       
##                                                                 15432 
##                  No, I am not focused on learning data science skills 
##                                                                    55 
## Yes, but data science is a small part of what I'm focused on learning 
##                                                                   429 
##               Yes, I'm focused on learning mostly data science skills 
##                                                                   800

# Column chart of the LearningDataScience responses. The column is taken
# from SurveyDf explicitly so the chunk does not depend on attach().
hchart(SurveyDf$LearningDataScience, name = "count", type = "column", color = "#99FF33") %>%
  hc_title(text = "Barplot of Learning Data science field", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

Hence we can notice that more than 90% of the participants did not answer either of these questions, so there is little scope for interpreting anything from them.

Let’s see how many are Coders

# Counts of the CodeWriter answers. with() resolves the column inside
# SurveyDf (no dependence on attach()) and keeps the "CodeWriter" header.
with(SurveyDf, table(CodeWriter))

## CodeWriter
##          No   Yes 
##  3530  3033 10153

# So most of them have entered Yes.

# Column chart of the CodeWriter answers; the column is taken from SurveyDf
# explicitly so the chunk does not depend on attach().
hchart(SurveyDf$CodeWriter, type = "column", name = "Count", color = "#9645FF") %>%
  hc_title(text = "Barplot of Number of Coders", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

# Let's now check which countries have the most coders.

# Count participants per (CodeWriter, Country) pair, keeping only explicit
# "Yes"/"No" answers. summarize() drops the last grouping level, so the
# result stays grouped by CodeWriter.
codeCountry <- SurveyDf %>%
  filter(CodeWriter %in% c("Yes", "No")) %>%
  group_by(CodeWriter, Country) %>%
  summarize(total = n())

# Top 10 countries within each CodeWriter group (coders and non-coders),
# ranked explicitly by `total` rather than by top_n()'s implicit last column.
TopCoders <- codeCountry %>%
  top_n(10, total) %>%
  arrange(desc(total))

# One series per CodeWriter group; the chart title previously read
# "Coder Writers" (typo).
hchart(
  TopCoders,
  type = "column",
  name = c("Do not write code", "Code Writers"),
  hcaes(x = Country, y = total, group = CodeWriter),
  color = c("black", "#FF4040")
) %>%
  hc_title(text = "Barplot of Countries grouped by Code Writers", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

Who has switched careers, and what are the participants' current job titles?

# Counts of the CareerSwitcher answers; among those who answered, "Yes"
# outnumbers "No". with() avoids depending on attach() and keeps the header.
with(SurveyDf, table(CareerSwitcher))

## CareerSwitcher
##          No   Yes 
## 13704   886  2126

# Counts per current job title. with() resolves the column inside SurveyDf
# (no dependence on attach()) and keeps the "CurrentJobTitleSelect" header.
with(SurveyDf, table(CurrentJobTitleSelect))

## CurrentJobTitleSelect
##                                                          Business Analyst 
##                                 4886                                  796 
##                   Computer Scientist                         Data Analyst 
##                                  335                                 1213 
##                           Data Miner                       Data Scientist 
##                                  118                                 2433 
##                DBA/Database Engineer                             Engineer 
##                                  187                                  552 
##            Machine Learning Engineer     Operations Research Practitioner 
##                                  617                                   58 
##                                Other                   Predictive Modeler 
##                                 1233                                  181 
##                           Programmer                           Researcher 
##                                  462                                  619 
##                 Scientist/Researcher Software Developer/Software Engineer 
##                                  978                                 1759 
##                         Statistician 
##                                  289

# Let's make a data frame of job-title counts to plot a barplot.
# Naming the table() argument keeps the column called CurrentJobTitleSelect
# without relying on attach().
jobdf <- as.data.frame(
  table(CurrentJobTitleSelect = SurveyDf$CurrentJobTitleSelect)
)

# Drop the unanswered (blank) job title explicitly. Writing a new label such
# as "Not answered" into this factor column is not possible: an unknown
# level becomes NA with a warning, which is how the row used to disappear.
jobdf <- jobdf[jobdf$CurrentJobTitleSelect != "", ]

jobdf %>%
  arrange(desc(Freq)) %>%
  hchart(
    hcaes(x = CurrentJobTitleSelect, y = Freq),
    name = "Count",
    color = "#751A75",
    type = "column"
  ) %>%
  hc_title(text = "Barplot of Current Job titles of the participants", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

# Let's see the trends country-wise.

# Job-title counts for the countries listed in countryCount, largest first.
# Unanswered (blank) job titles are dropped by value rather than by blanking
# fixed row positions, so the result does not depend on where those rows
# happen to sort. Note: countryJobs therefore holds no blank-title rows for
# any country.
countryJobs <- SurveyDf %>%
  filter(Country %in% countryCount$Var1, CurrentJobTitleSelect != "") %>%
  group_by(Country, CurrentJobTitleSelect) %>%
  summarize(total = n()) %>%
  arrange(desc(total))

# Colour vectors for plotting: `colors` holds 16 colours and `colors2` the
# same 16 plus one more.
colors <- c(
  "#d35400", "#2980b9", "#2ecc71", "#f1c40f", "#2c3e50", "#7f8c8d",
  "#000004", "#3B0F70", "#8C2981", "#DE4968", "#FE9F6D", "#FCFDBF",
  "#ffb3b3", "#66ff33", "#00b3b3", "#4d4dff"
)

# The extra colour was written without its leading "#", which is not a
# valid hex colour string.
colors2 <- c(colors, "#7D8A16")

# Separate country subsets for comparative plots of the job titles of each
# country's participants: first the United States and India.
USIndJobs <- countryJobs %>%
  filter(Country %in% c("United States", "India"))

hchart(
  USIndJobs,
  type = "column",
  hcaes(Country, y = total, group = CurrentJobTitleSelect),
  color = colors
) %>%
  hc_title(text = "Barplot of Jobs of Country's participants", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

# USA, India and Russia - the three named countries with the most responses.
JobsMajorCountry <- countryJobs %>%
  filter(Country %in% c("United States", "India", "Russia"))

hchart(
  JobsMajorCountry,
  type = "column",
  hcaes(Country, y = total, group = CurrentJobTitleSelect),
  color = colors
) %>%
  hc_title(text = "Barplot of Jobs of Country's participants", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

The above plots compare the current job titles of survey participants from India and the USA (the second plot also includes Russia).

1) From India, most of the participants were software developers, followed by data scientists.

2) From the USA, most of the participants were data scientists, followed by software developers.

Let’s check where the participants were employed.

I will make a dataframe with only the top 15 employer types.

# Finding the top 15 employer types by number of participants.
# The table() argument is named so the column keeps the name
# CurrentEmployerType without relying on attach(), and top_n() is told
# explicitly to rank by Freq.
Employer <- as.data.frame(
  table(CurrentEmployerType = SurveyDf$CurrentEmployerType)
) %>%
  top_n(15, Freq) %>%
  arrange(desc(Freq))

## Selecting by Freq

names(Employer) <- c("EmployerType", "Count")

# Blank out the unanswered (empty) employer type by value instead of by row
# position. NOTE(review): the first row was previously blanked on the
# assumption that the empty answer is the most frequent one - confirm.
# %in% never returns NA, so re-running this chunk is safe.
Employer[Employer$EmployerType %in% "", ] <- NA

# na.omit() drops the row blanked above before plotting.
hchart(
  na.omit(Employer),
  type = "column",
  hcaes(x = EmployerType, y = Count),
  color = "#0E2E93"
) %>%
  hc_title(text = "Barplot of top 15 Type of Employers", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())

# The 20 most frequent (employer type, country) combinations. with()
# resolves both columns inside SurveyDf (no dependence on attach()) and keeps
# their names; top_n() is told explicitly to rank by Freq.
EmployerType <- as.data.frame(
  with(SurveyDf, table(CurrentEmployerType, Country))
) %>%
  top_n(20, Freq) %>%
  arrange(desc(Freq))

## Selecting by Freq

# Assign NA to the missing (blank) employer types. They are matched by value
# so that they are found wherever they sort, rather than at hard-coded row
# positions that break as soon as the ranking or its ties change.
EmployerType[EmployerType[[1]] %in% "", 1] <- NA

colnames(EmployerType) <- c("EmployerType", "Country", "Count")

# Plotting data; na.omit() drops the rows marked as missing above.
hchart(
  na.omit(EmployerType),
  type = "column",
  hcaes(x = EmployerType, y = Count, group = Country),
  color = c("#DE4968", "#f1c40f", "black")
) %>%
  hc_title(text = "Barplot of Type of Employer and Country", align = "center") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_add_theme(hc_theme_elementary())