PART 1: Basic Analysis
Load the Project’s Libraries
library(tidyverse)
library(plotly)
Get the Data
file_name = file.path(getwd(),"h1b_kaggle.csv.zip")
messy_data = read_csv(file_name, progress=FALSE)
dim(messy_data)
[1] 3002458 11
knitr::kable(head(messy_data) %>% select(-X1,-lon,-lat), digits=0)
Tidy the Data
Remove Defective Rows and Useless Columns
# Remove the index column
tidy_data = tidy_data %>% select(-X1)
# Discard rows with NA (overlook the lan and lon variables)
tidy_data = tidy_data %>% drop_na(-lon, -lat)
Convert Variables Types
tidy_data$CASE_STATUS = tidy_data %>% .$CASE_STATUS %>% as.factor()
tidy_data$EMPLOYER_NAME = tidy_data %>% .$EMPLOYER_NAME %>% as.factor()
tidy_data$SOC_NAME = tidy_data %>% .$SOC_NAME %>% as.factor()
tidy_data$JOB_TITLE = tidy_data %>% .$JOB_TITLE %>% as.factor()
tidy_data$FULL_TIME_POSITION = tidy_data %>% .$FULL_TIME_POSITION %>% as.factor()
tidy_data$WORKSITE = tidy_data %>% .$WORKSITE %>% as.factor()
Make Factor Levels Insensitive for Capitalization
e.g. Economists = economists are assigned as the same factor level
tidy_data = tidy_data %>%
map_if(is.factor, fct_relabel, fun=function(x) {x=str_to_title(x)}) %>%
as_tibble()
PART 2: Exploratory Data Analysis
Data Scientist Data Set Preprocessing
Filter Data Science Jobs
According to a recent survey in the data analytics domain, this is how practitioners call themselves:
- Data Scientist (31%)
- Data Analyst (12%)
- Researcher (11%)
- Business Analyst (9%)
- Statistician (6%)
- Predictive Modeler (5%)
- Other Job Titles (26%)
jobs_titles = c('Data Scientist',
'Data Analyst',
'Business Analyst',
'Statistician',
# Convert the job title vector into one regular expression query
jobs_pattern = jobs_titles %>% sapply(function(x) paste0("^",x,"*")) %>% paste(collapse='|')
tidy_jobs = tidy_data %>% filter(str_detect(JOB_TITLE,jobs_pattern))
# Drop unused facotor levels
tidy_jobs$JOB_TITLE = tidy_jobs %>% .[["JOB_TITLE"]] %>% fct_drop()
There are 1625 matching job titles and 52975 subsequent petitions
Collapse the different job title variations into their basic form of 5 titles:
Data Scientist, Data Analyst, Business Analyst, Statistician, Predictive Modeler
for(jobs_title in jobs_titles) {
# Find the rows which correspond the the current job title
rows_index = tidy_jobs %>% .$JOB_TITLE %>% str_detect(paste0("^",jobs_title,"*"))
# Replace the different title variations with their stem
tidy_jobs[rows_index,"JOB_TITLE"] = jobs_title
}
# Drop unused facotor levels
tidy_jobs$JOB_TITLE = tidy_jobs %>% .[["JOB_TITLE"]] %>% fct_drop()
There are 5 matching job titles and 52975 subsequent petitions
Business Analyst |
44471 |
84 |
Data Analyst |
5036 |
10 |
Data Scientist |
2234 |
4 |
Statistician |
1213 |
2 |
Predictive Modeler |
21 |
0 |
Remove Salary’s Outliers
Find the 2.5% and 97.5% quantiles and consider only records within these borders.
lims = tidy_jobs %>% .$PREVAILING_WAGE %>% quantile(c(2.5,97.5)/100)
tidy_jobs = tidy_jobs %>% filter(PREVAILING_WAGE > lims[1], PREVAILING_WAGE < lims[2])
Visualisations
For simplicity’s sake, a major part of this section the focus is solely on Data Science jobs.
Petitions Status of Data Science Jobs
ds_petitions_status <- ggplot(tidy_jobs, aes(fill=CASE_STATUS)) +
geom_bar(aes(x=JOB_TITLE), position="fill") +
coord_flip() +
theme(legend.position="bottom") +
# guides(fill=guide_legend(nrow=1))
labs(x="Job Title", y="Percentage", title='Petition Status of Data Scientists Jobs')
plot(ds_petitions_status)

Good News! 95 % of all Data Scientist petitions were certified.
Data Scientists’ Salary Distribution
Since the type of position (full-time vs. part-time) influences the salary, we condition the density plot on the position type.
salaryStats = tidy_jobs %>%
filter(JOB_TITLE=="Data Scientist") %>%
group_by(FULL_TIME_POSITION) %>%
summarise(med = median(PREVAILING_WAGE)/1e3)
ds_dens = ggplot(tidy_jobs %>% filter(JOB_TITLE=="Data Scientist")) +
geom_density(aes(x=PREVAILING_WAGE/1e3, color=FULL_TIME_POSITION), size=1.2) +
geom_vline(aes(xintercept=med, color=FULL_TIME_POSITION), salaryStats, size=1.2) +
theme_bw() + theme(legend.position="bottom") +
labs(x="Salary (in thousand USD)", y="Density of Data Science Salary",
title="Data Scientists' Salary Distribution by Position Type",
subtitle="With Salary Medians") +
scale_x_continuous(breaks=seq(40,150,5))
plot(ds_dens)

Data Scientists’ Salary Trend
trends_conditioned = tidy_jobs %>%
filter(JOB_TITLE=="Data Scientist") %>%
group_by(FULL_TIME_POSITION,YEAR) %>%
summarise(med = median(PREVAILING_WAGE)/1e3, N=n())
ds_trend <- ggplot(tidy_jobs %>% filter(JOB_TITLE=="Data Scientist"),
aes(x=YEAR, y=PREVAILING_WAGE/1e3, color=FULL_TIME_POSITION)) +
geom_point(aes(x=YEAR, y=med, color=FULL_TIME_POSITION, size=N), trends_conditioned) +
geom_line(aes(x=YEAR, y=med, color=FULL_TIME_POSITION), trends_conditioned) +
geom_smooth(method="lm", se=FALSE, linetype=9) +
scale_y_continuous(breaks=seq(40,140,5)) +
guides(size=guide_legend(title="Number of Jobs Petitions"),
color=guide_legend(title="Position Type")) +
labs(title="Data Scientists' Salary Trend",
subtitle="with the number of related petitions per year",
x="Year",y="Salary (in thousand USD)") +
theme_bw()
plot(ds_trend)

---
title: "H-1B Visa Petitions 2011-2016"
subtitle: "The Data Scientist Aspect"
author: "Harel Lustiger"
output: 
  html_notebook: 
    highlight: tango
    theme: journal
    toc: yes
---

```{r setup, include=FALSE}
rm(list = ls()); cat("\014")
knitr::opts_chunk$set(fig.width=6, fig.height=4.5, fig.align="center")
```

---

# PART 1: Basic Analysis

## Load the Project's Libraries

```{r load_libraries, message=FALSE, warning=FALSE}
library(tidyverse)
library(plotly)
```

## Get the Data

```{r get_the_data, message=FALSE, warning=FALSE, cache=TRUE}
file_name = file.path(getwd(),"h1b_kaggle.csv.zip")
messy_data = read_csv(file_name, progress=FALSE)
dim(messy_data)
knitr::kable(head(messy_data) %>% select(-X1,-lon,-lat), digits=0)
```

## Tidy the Data

```{r tidy_data}
tidy_data = messy_data
```

### Remove Defective Rows and Useless Columns

```{r remove_cols_and_rows}
# Remove the index column
tidy_data = tidy_data %>% select(-X1)

# Discard rows with NA (overlook the lan and lon variables)
tidy_data = tidy_data %>% drop_na(-lon, -lat)
```

### Convert Variables Types

```{r convert_vars_types}
tidy_data$CASE_STATUS = tidy_data %>% .$CASE_STATUS %>% as.factor()
tidy_data$EMPLOYER_NAME = tidy_data %>% .$EMPLOYER_NAME %>% as.factor()
tidy_data$SOC_NAME = tidy_data %>% .$SOC_NAME %>% as.factor()
tidy_data$JOB_TITLE = tidy_data %>% .$JOB_TITLE %>% as.factor()
tidy_data$FULL_TIME_POSITION = tidy_data %>% .$FULL_TIME_POSITION %>% as.factor()
tidy_data$WORKSITE = tidy_data %>% .$WORKSITE %>% as.factor()
```

### Make Factor Levels Insensitive for Capitalization

e.g. **Economists = economists** are assigned as the same factor level 

```{r make facto_levels_insensitive_for_capitalization}
tidy_data = tidy_data %>% 
    map_if(is.factor, fct_relabel, fun=function(x) {x=str_to_title(x)}) %>% 
    as_tibble()
```


# PART 2: Exploratory Data Analysis

## Data Scientist Data Set Preprocessing

### Filter Data Science Jobs

According to a recent survey in the data analytics domain, this is how 
practitioners call themselves:

* Data Scientist (31%)
* Data Analyst (12%)
* Researcher (11%)
* Business Analyst (9%)
* Statistician (6%)
* Predictive Modeler (5%)
* Other Job Titles (26%)

```{r filter_jobs}
jobs_titles = c('Data Scientist',
                'Data Analyst',
                'Business Analyst',
                'Statistician',
                'Predictive Modeler')
# Convert the job title vector into one regular expression query
jobs_pattern = jobs_titles %>% sapply(function(x) paste0("^",x,"*")) %>% paste(collapse='|')
tidy_jobs = tidy_data %>% filter(str_detect(JOB_TITLE,jobs_pattern)) 
# Drop unused facotor levels
tidy_jobs$JOB_TITLE = tidy_jobs %>% .[["JOB_TITLE"]] %>% fct_drop()
```

```{r filter_jobs_statistics_1, echo=FALSE}
n_levels = tidy_jobs %>% .[["JOB_TITLE"]] %>% nlevels
n_rows = nrow(tidy_jobs)
cat("There are",n_levels,"matching job titles and",n_rows,"subsequent petitions")
```

Collapse the different job title variations into their basic form of
`r length(jobs_titles)` titles:   
`r jobs_titles`

```{r collapse_factor_levels_into_theme_groups}
for(jobs_title in jobs_titles) {
  # Find the rows which correspond the the current job title 
  rows_index = tidy_jobs %>% .$JOB_TITLE %>% str_detect(paste0("^",jobs_title,"*"))
  # Replace the different title variations with their stem
  tidy_jobs[rows_index,"JOB_TITLE"] = jobs_title
} 
# Drop unused facotor levels
tidy_jobs$JOB_TITLE = tidy_jobs %>% .[["JOB_TITLE"]] %>% fct_drop()
```

```{r filter_jobs_statistics_2, echo=FALSE, results="as.is"}
n_levels = tidy_jobs %>% .[["JOB_TITLE"]] %>% nlevels
n_rows = nrow(tidy_jobs)
cat("There are",n_levels,"matching job titles and",n_rows,"subsequent petitions")

knitr::kable(
    tidy_jobs %>% group_by(JOB_TITLE) %>% summarise(N=n(), Percent=round(100*N/nrow(tidy_jobs))) %>% arrange(desc(N)),
    col.names=c("Job Title","Occurrences in the Dataset","Percentage of the Dataset"),
    format="markdown", caption="Job Title Summary Statistics")
```

### Remove Salary's Outliers

Find the 2.5% and 97.5% quantiles and consider only records within these 
borders.

```{r remove_salary_outliers}
lims = tidy_jobs %>% .$PREVAILING_WAGE %>% quantile(c(2.5,97.5)/100)
tidy_jobs = tidy_jobs %>% filter(PREVAILING_WAGE > lims[1], PREVAILING_WAGE < lims[2])
```


## Visualisations

For simplicity's sake, a major part of this section the focus is solely on 
**Data Science** jobs.

### Petitions Status of Data Science Jobs

```{r data_scientists_petitions_status, fig.asp=.5}
ds_petitions_status <- ggplot(tidy_jobs, aes(fill=CASE_STATUS)) +
    geom_bar(aes(x=JOB_TITLE), position="fill") +
    coord_flip() + 
    theme(legend.position="bottom") +
    # guides(fill=guide_legend(nrow=1))
    labs(x="Job Title", y="Percentage", title='Petition Status of Data Scientists Jobs')
  
plot(ds_petitions_status)
```

```{r data_scientists_petitions_stats, echo=FALSE}
pet_stats = tidy_jobs %>% filter(JOB_TITLE=="Data Scientist") %>% 
    group_by(CASE_STATUS) %>% summarize(STATUS = n())
certified_per = sum(pet_stats %>%
                        filter(CASE_STATUS %in% c("Certified","Certified-Withdrawn")) %>%
                        select(STATUS)) / sum(pet_stats %>% select(STATUS))
```

Good News! `r round(100*certified_per)` % of all Data Scientist petitions were 
certified.

<!--
What is the total % of certified + certified-withdrawn in this vocation
-->

### Data Scientists' Salary Distribution

Since the type of position (full-time vs. part-time) influences the salary, we 
condition the density plot on the position type. 

```{r data_scientists_salary_distribution}
salaryStats = tidy_jobs %>% 
    filter(JOB_TITLE=="Data Scientist") %>% 
    group_by(FULL_TIME_POSITION) %>%
    summarise(med = median(PREVAILING_WAGE)/1e3)
  
ds_dens = ggplot(tidy_jobs %>% filter(JOB_TITLE=="Data Scientist")) +
    geom_density(aes(x=PREVAILING_WAGE/1e3, color=FULL_TIME_POSITION), size=1.2) + 
    geom_vline(aes(xintercept=med, color=FULL_TIME_POSITION), salaryStats, size=1.2) + 
    theme_bw() + theme(legend.position="bottom") +
    labs(x="Salary (in thousand USD)", y="Density of Data Science Salary",
         title="Data Scientists' Salary Distribution by Position Type",
         subtitle="With Salary Medians") + 
    scale_x_continuous(breaks=seq(40,150,5))
  
plot(ds_dens)
```

<!-- ### Data Scientists' Jobs Petitions -->

```{r job_petitions, eval=FALSE, include=FALSE}
salaryStats = tidy_jobs %>% 
    filter(JOB_TITLE=="Data Scientist") %>% 
    group_by(FULL_TIME_POSITION,YEAR) %>%
    summarise(N=n())

ds_petitions <- ggplot() +
    geom_line(aes(x=YEAR, y=N, color=FULL_TIME_POSITION), salaryStats)

plot(ds_petitions)
```

### Data Scientists' Salary Trend

```{r salary_trends}
trends_conditioned = tidy_jobs %>% 
    filter(JOB_TITLE=="Data Scientist") %>% 
    group_by(FULL_TIME_POSITION,YEAR) %>%
    summarise(med = median(PREVAILING_WAGE)/1e3, N=n()) 
  
ds_trend <- ggplot(tidy_jobs %>% filter(JOB_TITLE=="Data Scientist"),
                   aes(x=YEAR, y=PREVAILING_WAGE/1e3, color=FULL_TIME_POSITION)) +
    geom_point(aes(x=YEAR, y=med, color=FULL_TIME_POSITION, size=N), trends_conditioned) +
    geom_line(aes(x=YEAR, y=med, color=FULL_TIME_POSITION), trends_conditioned) +
    geom_smooth(method="lm", se=FALSE, linetype=9) +
    scale_y_continuous(breaks=seq(40,140,5)) +
    guides(size=guide_legend(title="Number of Jobs Petitions"),
           color=guide_legend(title="Position Type")) +
    labs(title="Data Scientists' Salary Trend", 
         subtitle="with the number of related petitions per year",
         x="Year",y="Salary (in thousand USD)") +
    theme_bw()
   
plot(ds_trend)
```

