Analyzing Stackoverflow developer survey

Overview

Stackoverflow recently released the anonymized results of their 2019 annual developer survey. The goal of this project is to analyse various trends for developers worldwide.

Data Wrangling

The dataset has 90,000 rows. Our goal is to filter R responses so that we can drill down further and analyze the trends related to R developers.

# Load the required packages. library() stops with an error when a package is
# missing, whereas require() only returns FALSE and lets the script carry on.
library(tidyverse)
library(countrycode)
library(highcharter)
library(ggplot2)

# Read the public survey results
data <- read_csv("data/survey_results_public.csv")
# NOTE(review): attach() is kept because later chunks refer to bare column
# names (e.g. table(OpenSourcer)); prefer data$column in new code.
attach(data)

# Tabulate the MainBranch answers and store the counts as a data frame
main_branch_counts <- table(data$MainBranch)
devTable <- as.data.frame(main_branch_counts)

Let’s create a barplot of the profession of the respondents.

# Column chart of the number of respondents per MainBranch answer
hchart(
  devTable, "column",
  hcaes(x = Var1, y = Freq),
  name = "Count",
  color = "#9B6ED8"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot of occupation of participants", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

So we can notice that most of the participants are developers.

Now let’s see how many respondents do development just as a hobby.

# Number of respondents per Hobbyist answer
table(data[["Hobbyist"]])

## 
##    No   Yes 
## 17626 71257

# Number of respondents per OpenSourcer answer. The column is taken from
# `data` explicitly instead of relying on attach(); naming the argument keeps
# the "OpenSourcer" header in the printed table.
table(OpenSourcer = data$OpenSourcer)

## OpenSourcer
## Less than once a month but more than once per year 
##                                              20561 
##                            Less than once per year 
##                                              24972 
##                                              Never 
##                                              32295 
##                         Once a month or more often 
##                                              11055

Let’s check the Employment of the respondents.

# Percentage of respondents per Employment answer, rounded to two decimals.
# The column is taken from `data` explicitly instead of relying on attach();
# naming the argument keeps the "Employment" header in the printed table.
round(prop.table(table(Employment = data$Employment)) * 100, 2)

## Employment
##                                   Employed full-time 
##                                                73.92 
##                                   Employed part-time 
##                                                 5.13 
## Independent contractor, freelancer, or self-employed 
##                                                 9.76 
##               Not employed, and not looking for work 
##                                                 4.36 
##                   Not employed, but looking for work 
##                                                 6.42 
##                                              Retired 
##                                                 0.41

We can notice that 74 % of the respondents were Full time employees.

Now let’s check to which country the respondents belonged.

# Respondent count per country, keeping the 10 top-ranked countries
# (top_n() keeps ties) and sorting them from largest to smallest
countryDf <- data %>%
  count(Country) %>%
  top_n(10) %>%
  arrange(desc(n))

# Column chart of the respondent counts for those countries
hchart(countryDf, "column", hcaes(x = Country, y = n), name = "Count") %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot of Country of respondents", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

From the above plot we can notice that most of the respondents were from the US and India, followed by Germany.

# Number of respondents per Student answer. The column is taken from `data`
# explicitly instead of relying on attach(); naming the argument keeps the
# "Student" header in the printed table.
table(Student = data$Student)

## Student
##             No Yes, full-time Yes, part-time 
##          65816          15769           5429

We can notice that most of the respondents are not Students.

Let’s see which country had most respondents as Students ?

To achieve this let’s create a Dataframe of countries with highest number of Student respondents.

# Full-time student respondents counted per country; the 30 top-ranked
# countries are kept (ties included) and sorted from largest to smallest
StudentDf <- data %>%
  filter(Student == "Yes, full-time") %>%
  count(Country) %>%
  top_n(30) %>%
  arrange(desc(n))

StudentDf

So most of the Students were from India,not from US.

Let’s check the Education level for the major countries like India and US.

# Respondents from India counted per education level, most common first;
# the row for a missing education level is dropped at the end
EdlevelIndia <- data %>%
  filter(Country == "India") %>%
  group_by(EdLevel) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  na.omit()

# Column chart of the counts per education level
hchart(
  EdlevelIndia, "column",
  hcaes(x = EdlevelIndia$EdLevel, y = EdlevelIndia$count),
  color = "red",
  name = "Count"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot Education level of Indian Respondents", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

Let’s check the results for USA.

# Respondents from the United States counted per education level,
# most common first
EdlevelUS <- data %>%
  filter(Country == "United States") %>%
  group_by(EdLevel) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Drop the row for a missing education level. This step previously re-cleaned
# EdlevelIndia (copy-paste slip), so the NA row stayed in the US chart.
EdlevelUS <- na.omit(EdlevelUS)

# Column chart of the counts per education level
hchart(
  EdlevelUS, "column",
  hcaes(x = EdlevelUS$EdLevel, y = EdlevelUS$count),
  color = "Green",
  name = "Count"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot Education level of American Respondents", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

Let’s check the results for Germany.

# Respondents from Germany counted per education level, most common first
Edlevelger <- data %>%
  filter(Country == "Germany") %>%
  group_by(EdLevel) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Drop the row for a missing education level, as the India chunk does, so the
# chart does not show an NA category
Edlevelger <- na.omit(Edlevelger)

# Column chart of the counts per education level
hchart(
  Edlevelger, "column",
  hcaes(x = Edlevelger$EdLevel, y = Edlevelger$count),
  color = "purple",
  name = "Count"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot Education level of german Respondents", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

Let’s analyze the UnderGraduate majors of the Participants

# Number of respondents per UndergradMajor answer. The column is taken from
# `data` explicitly instead of relying on attach(); naming the argument keeps
# the "UndergradMajor" header in the printed table.
table(UndergradMajor = data$UndergradMajor)

## UndergradMajor
##            A business discipline (ex. accounting, finance, marketing) 
##                                                                  1841 
##                   A health science (ex. nursing, pharmacy, radiology) 
##                                                                   323 
##         A humanities discipline (ex. literature, history, philosophy) 
##                                                                  1571 
##                   A natural science (ex. biology, chemistry, physics) 
##                                                                  3232 
##    A social science (ex. anthropology, psychology, political science) 
##                                                                  1352 
##    Another engineering discipline (ex. civil, electrical, mechanical) 
##                                                                  6222 
##       Computer science, computer engineering, or software engineering 
##                                                                 47214 
##  Fine arts or performing arts (ex. graphic design, music, studio art) 
##                                                                  1233 
##                                              I never declared a major 
##                                                                   976 
## Information systems, information technology, or system administration 
##                                                                  5253 
##                                             Mathematics or statistics 
##                                                                  2975 
##                                         Web development or web design 
##                                                                  3422

As expected, most of them are Computer Science graduates, followed by another engineering discipline and Information Systems.

Country and the under-graduate programmes in which the participants were enrolled.

# Undergraduate majors per country: each table counts the respondents of one
# country per UndergradMajor answer, sorts by decreasing count and drops the
# row for a missing major.

# India
UGIndia <- data %>%
  filter(Country == "India") %>%
  group_by(UndergradMajor) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  na.omit()

UGIndia

# United States
UGUS <- data %>%
  filter(Country == "United States") %>%
  group_by(UndergradMajor) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  na.omit()

UGUS

# Germany
UGGermany <- data %>%
  filter(Country == "Germany") %>%
  group_by(UndergradMajor) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  na.omit()

UGGermany

We see almost similar trends for all the 3 major countries.

Let’s see what type of developers were there ?

# DevType answers of respondents from the countries listed in countryDf.
# Answers given by more than 100 respondents are kept, sorted by decreasing
# count, and the row for a missing answer is dropped.
# NOTE(review): DevType looks like a multi-select answer stored as a single
# string, so counts are per combination of roles - confirm against the schema.
DevTypeDf <- data %>%
  filter(Country %in% countryDf$Country) %>%
  group_by(DevType) %>%
  summarise(Count = n()) %>%
  filter(Count > 100) %>%
  arrange(desc(Count)) %>%
  na.omit()

DevTypeDf

So most of the respondents from the top 10 countries were Full stack developers and backend developers.

Let’s check which jobs the Indian, American and German respondents had.

# Most common DevType answers for India, the United States and Germany.
# Each pipeline ignores respondents without a DevType answer (as DevTypeDf
# does), counts respondents per answer and keeps the 10 highest counts
# (top_n() keeps ties). The ranking column is passed to top_n() explicitly
# rather than relying on "last column" selection.
# The Germany table used to be computed twice; the duplicate is removed.
DevIndia <- data %>%
  filter(Country == "India", !is.na(DevType)) %>%
  group_by(DevType) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  top_n(10, Count)

DevUs <- data %>%
  filter(Country == "United States", !is.na(DevType)) %>%
  group_by(DevType) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  top_n(10, Count)

DevGer <- data %>%
  filter(Country == "Germany", !is.na(DevType)) %>%
  group_by(DevType) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  top_n(10, Count)

Creating the Plot:

# Column charts of the DevType counts, one chart per country

# India
hchart(
  DevIndia, "column",
  hcaes(x = DevIndia$DevType, y = DevIndia$Count)
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot of top developers in India", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# United States
hchart(
  DevUs, "column",
  hcaes(x = DevUs$DevType, y = DevUs$Count)
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot of top developers in US", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# Germany
hchart(
  DevGer, "column",
  hcaes(x = DevGer$DevType, y = DevGer$Count)
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Barplot of top developers in Germany", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

Let’s check the Job satisfaction of the Respondents:

This is a really interesting feature to analyze. We can analyze which specific Job roles were most satisfied or in the list of top 10 countries how many respondents were satisfied with their Jobs etc.

# Number of respondents per JobSat answer. The column is taken from `data`
# explicitly instead of relying on attach(); naming the argument keeps the
# "JobSat" header in the printed table.
table(JobSat = data$JobSat)

## JobSat
## Neither satisfied nor dissatisfied              Slightly dissatisfied 
##                               8720                              10752 
##                 Slightly satisfied                  Very dissatisfied 
##                              24207                               4857 
##                     Very satisfied 
##                              22452

Let’s create a helper function

#' Count rows per category of one column within a filtered subset.
#'
#' Keeps the rows of `data` where `fil_var` equals `val`, counts the rows for
#' each value of `gp_var`, leaves out the missing category and sorts the
#' counts in descending order.
#'
#' Usage: DrillDown(dataset, filter_by_var, filter_by_val, group_by_var)
#'
#' @param data A data frame holding the survey responses.
#' @param fil_var Unquoted column of `data` to filter on (e.g. `Country`).
#' @param val Value of `fil_var` to keep (e.g. `"India"`).
#' @param gp_var Unquoted column of `data` to count by (e.g. `JobSat`).
#' @return A data frame with the columns `Satisfaction` (the values of
#'   `gp_var`) and `Count`, sorted by decreasing `Count`.
DrillDown <- function(data, fil_var, val, gp_var) {
  # {{ }} resolves the column arguments inside `data` itself. The function
  # therefore no longer calls attach() (which put another copy of `data` on
  # the search path at every call) and gives correct results for any data
  # frame passed in, not only for the globally attached one.
  data %>%
    filter({{ fil_var }} == val, !is.na({{ gp_var }})) %>%
    group_by(Satisfaction = {{ gp_var }}) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))
}

Job satisfaction and Career satisfaction of Respondents from specific countries:

# !diagnostics off
# Job satisfaction counts for selected countries
JobSatIndia <- DrillDown(data, Country, "India", JobSat)

JobSatUS <- DrillDown(data, Country, "United States", JobSat)

# Fixed: this series was filtered on "Argentina" although it is the UK table
# (the matching career-satisfaction call below uses "United Kingdom").
JobSatUk <- DrillDown(data, Country, "United Kingdom", JobSat)

JobSatChina <- DrillDown(data, Country, "China", JobSat)

JobSatJapan <- DrillDown(data, Country, "Japan", JobSat)

# Career satisfaction counts for the same countries
CarSatIndia <- DrillDown(data, Country, "India", CareerSat)

CarSatUS <- DrillDown(data, Country, "United States", CareerSat)

CarSatUk <- DrillDown(data, Country, "United Kingdom", CareerSat)

CarSatChina <- DrillDown(data, Country, "China", CareerSat)

CarSatJapan <- DrillDown(data, Country, "Japan", CareerSat)

For almost all the countries, job and career satisfaction were the same: most respondents were Slightly Satisfied.

Whereas in Country like US, most of the respondents were Very Satisfied.

Only China had most of the respondents which were Slightly Satisfied and Slightly Dissatisfied .

Let’s analyze the top Job Factors

# JobFactors answers counted over all respondents: the 11 top-ranked answers
# are kept (ties included), then the row for a missing answer is dropped
# NOTE(review): JobFactors looks like a multi-select answer stored as a single
# string, so counts are per combination of factors - confirm against the schema.
TOPJF <- data %>%
  group_by(JobFactors) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  top_n(11) %>%
  na.omit()

# Column chart of the counts per JobFactors answer
hchart(
  TOPJF, "column",
  hcaes(x = TOPJF$JobFactors, y = TOPJF$Count),
  color = "#FDE725"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Job Factors which matter the most", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

Let’s analyze the top Job factors for various Countries

# JobFactors counts for India: 10 top-ranked answers (ties kept)
JfIndia <- DrillDown(data, Country, "India", JobFactors)

JfIndia <- JfIndia %>%
  top_n(10)

hchart(
  JfIndia, "column",
  hcaes(x = JfIndia$Satisfaction, y = JfIndia$Count),
  color = "#FDE725"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Job Factors which matter the most for Indians", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# JobFactors counts for the United Kingdom: 10 top-ranked answers (ties kept).
# Fixed: this table was filtered on "United States" although it is charted
# as the UK series.
JfUk <- DrillDown(data, Country, "United Kingdom", JobFactors)

JfUk <- JfUk %>%
  top_n(10)

hchart(
  JfUk, "column",
  hcaes(x = JfUk$Satisfaction, y = JfUk$Count),
  color = "#FDE725"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Job Factors which matter the most for UK", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

So in India respondents mostly cared for factors such as ‘Languages, frameworks, and other technologies I’d be working with;Office environment or company culture;Opportunities for professional development’

# JobFactors counts for further countries. Each section builds the count table
# with DrillDown(), keeps the 10 top-ranked answers (ties kept) and draws a
# column chart of them.

# United States
JfUS <- DrillDown(data, Country, "United States", JobFactors) %>%
  top_n(10)

hchart(
  JfUS, "column",
  hcaes(x = JfUS$Satisfaction, y = JfUS$Count),
  color = "#FDE725"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Job Factors which matter the most for US", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# Germany
JfGer <- DrillDown(data, Country, "Germany", JobFactors) %>%
  top_n(10)

hchart(
  JfGer, "column",
  hcaes(x = JfGer$Satisfaction, y = JfGer$Count),
  color = "#FDE725"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Job Factors which matter the most for Germany", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# China
JfChina <- DrillDown(data, Country, "China", JobFactors) %>%
  top_n(10)

hchart(
  JfChina, "column",
  hcaes(x = JfChina$Satisfaction, y = JfChina$Count),
  color = "#FDE725"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Job Factors which matter the most for China", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

# Japan (rows with missing values are dropped again before ranking)
JfJapan <- DrillDown(data, Country, "Japan", JobFactors) %>%
  na.omit() %>%
  top_n(10)

hchart(
  JfJapan, "column",
  hcaes(x = JfJapan$Satisfaction, y = JfJapan$Count),
  color = "#FDE725"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Job Factors which matter the most for Japan", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

Let’s check the Weekly Working hours of the Respondents

# Mean weekly working hours over all respondents, ignoring missing answers.
# The column is taken from `data` explicitly instead of relying on attach().
mean(data$WorkWeekHrs, na.rm = TRUE)

## [1] 42.1272

# Mean weekly working hours per country, ignoring missing answers
CountryWorkHrs <- data %>%
  select(Country, WorkWeekHrs) %>%
  group_by(Country) %>%
  summarise(Mean = mean(WorkWeekHrs, na.rm = TRUE))

# The 20 top-ranked countries by mean weekly hours (ties kept), highest first
TopCountryWork <- CountryWorkHrs %>%
  arrange(desc(Mean)) %>%
  top_n(20)

# Column chart of the mean weekly hours for those countries
hchart(
  TopCountryWork, "column",
  hcaes(x = TopCountryWork$Country, y = TopCountryWork$Mean),
  color = "#ADE554"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Highest weekly work hours", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

So the average working hours were 42 hours.

We can notice that the Country where the weekly work hours were highest is Finland followed by Norway.

Let’s see the challenges that the Respondents face at work ?

# WorkChallenge answers counted over all respondents: the 15 top-ranked
# answers are kept (ties included), then the row for a missing answer is dropped
# NOTE(review): WorkChallenge looks like a multi-select answer stored as a
# single string, so counts are per combination - confirm against the schema.
WorkChallTop <- data %>%
  group_by(WorkChallenge) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  top_n(15) %>%
  na.omit()

# Column chart of the counts per WorkChallenge answer
hchart(
  WorkChallTop, "column",
  hcaes(x = WorkChallTop$WorkChallenge, y = WorkChallTop$Count),
  color = "#FEA554"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Top Work challenges", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

The top Work challenges included things like :

Being tasked with non-development work.
Not enough people for the workload.
Distracting work environment.
Meetings.
Non-work commitments (parenting, school work, hobbies, etc.)
Toxic Work environment.

The top languages the respondents have worked with ?

# LanguageWorkedWith answers counted over all respondents: the 20 top-ranked
# answers are kept (ties included), then the row for a missing answer is dropped
# NOTE(review): LanguageWorkedWith looks like a multi-select answer stored as a
# single string, so counts are per combination of languages - confirm.
TopLang <- data %>%
  group_by(LanguageWorkedWith) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  top_n(20) %>%
  na.omit()

# Column chart of the counts per LanguageWorkedWith answer
hchart(
  TopLang, "column",
  hcaes(x = TopLang$LanguageWorkedWith, y = TopLang$Count),
  color = "#EAF253"
) %>%
  hc_exporting(enabled = TRUE) %>%
  hc_title(text = "Top Programming languages used By developers", align = "center") %>%
  hc_add_theme(hc_theme_elementary())

So we can notice that the most used languages are HTML/CSS;JavaScript;PHP;SQL;C#.