Description

This report provides an analysis of salaries of data science related jobs using various regression models. The data for this research is based from kaggle and here is the link: click here

The project is structured as follows :

  1. Data Understanding

  2. Data Visualisation and Exploratory Data Analysis

  3. Data Preprocessing

  4. Modeling

  5. Performance Evaluations

1. Data Understanding

rm(list=ls())
library(ggplot2)
library(plyr)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice

Creating the base dataframe by converting the data

df <- read.csv("ds_salaries.csv")

The dataset contains 607 rows and 12 columns. The target variable is Salary_in_USD and the remains are the features.

Finding out about some of the data

View(head(df))

Finding out the whole data

View(df)

Finding out more with data summarization

View(summary(df))

Finding the one with the highest salary_in_usd

match(600000, df$salary_in_usd)
## [1] 253

2.Exploratory Data Analysis

2.0 Prerequisites before visualizing the data

Convertion of the raw data to grasp a better understanding and factoring the categorical datas

df$remote_ratio <- factor(df$remote_ratio, levels = c(0,50,100),
labels = c("No Remote Work", "Partially Remote", "Fully Remote"))

df$experience_level <- factor(df$experience_level, levels = c("EN", "MI", "SE", "EX"),
                          labels = c("Entry Level", "Junior Mid", "Senior", "Executive"))

df$employment_type <- factor(df$employment_type, levels = c("PT", "CT", "FL", "FT"),
                              labels = c("Part-time", "Contract", "Freelance", "Full-time"))
df$company_size <- factor(df$company_size, 
                          levels = c("S", "M", "L"),
                          labels = c("Small", "Medium", "Large"))
df$work_year <- factor(df$work_year)

2.1 Univariate Data Analysis

Visualization of salary_in_usd with Boxplot

ggplot(df,
       aes(y = salary_in_usd)) + geom_boxplot() +
  scale_y_continuous(breaks = c(0, 100000, 200000, 300000,
                                400000, 500000, 600000),
                     labels = c("0", "100000", "200000",
                                "300000", "400000",
                                "500000", "600000"),
                     limits= c(0, 600000))

2.2 Bivariate Data Analysis

Visualization of salary_in_usd and job_title with Histogram

ggplot(df,
       aes(y = salary_in_usd,
           fill = job_title,
           group = job_title
           )) + geom_histogram(bins = 25) +
  scale_y_continuous(limits=c(0,1000000)) +
  labs(title = "Positions Matter",y = "Salary in USD")

Visualization of salary_in_usd and experience_level with Histogram

ggplot(df,
       aes(y = salary_in_usd,
           fill = experience_level,
           group = experience_level
       )) + geom_histogram(bins = 20) +
  scale_y_continuous(limits=c(0,1000000)) +
  labs(y = "Salary in USD") +
  labs(title = "Experience Matters",y = "Salary in USD")

Visualization of salary_in_usd and company_size with Histogram

ggplot(df,
       aes(y = salary_in_usd,
           fill = company_size,
           group = company_size,
       )) + geom_histogram(bins = 20) +
  scale_y_continuous(limits=c(0,1000000)) +
  labs(y = "Salary in USD") +
  labs(title = "Workplace Matters",y = "Salary in USD")

Visualization of salary_in_usd and remote_ratio with Scatter Plot

ggplot(data = df,
       aes(y = salary_in_usd,
           x = remote_ratio
       )) + geom_point(color = "cyan", 
                     position = "jitter") +
  scale_y_continuous(limits=c(0,1000000)) +
  labs(y = "Salary in USD") +
  labs(title = "Work From Home Salaries",y = "Salary in USD")

Visualization of salary_in_usd and employment_type with Density Plot

ggplot(data = df,
       aes(y = salary_in_usd,
           fill = employment_type,
           group = employment_type
       )) + geom_density(color = "yellow" ) +
  scale_y_continuous(limits=c(0,1000000)) +
  labs(y = "Salary in USD") +
  labs(title = "Working Hours Matter",y = "Salary in USD")

Visualization of salary_in_usd and company_location with Histogram

ggplot(data = df,
       aes(x = salary_in_usd,
           fill = company_location,
           group = company_location
       )) + geom_histogram(bins=30) +
  scale_x_continuous(limits=c(0,1000000)) +
  labs(title = "Working Locations",x = "Salary in USD")

3.Data Preprocessing

3.1 Outlier Handling (removing the outliers)

ggplot(data = df,
       aes(y = salary_in_usd)) + geom_boxplot()

boxplot.stats(df$salary_in_usd)
## $stats
## [1]   2859  62726 101570 150000 276000
## 
## $n
## [1] 607
## 
## $conf
## [1]  95973.1 107166.9
## 
## $out
##  [1] 325000 450000 412000 450000 423000 416000 600000 324000 380000 405000
outlier <- boxplot.stats(df$salary_in_usd)$out

outlier_idx <- which(df$salary_in_usd %in% c(outlier))

df <- df[-outlier_idx]

3.2 Label Encoding

le_experience <- function(features){
  value=0
  if(features == "Entry Level"){
    value = 1
  }else if(features == "Junior Mid"){
    value = 2
  }else if(features == "Senior"){
    value = 3
  }else{
    value = 4
  }
  
  return(value)
  
}

le_test <- sapply(df$experience_level, le_experience)
le_test
##   [1] 2 3 3 2 3 1 3 2 2 3 1 2 1 2 2 2 1 3 1 2 2 2 3 2 2 4 1 3 1 3 2 1 3 2 2 2 2
##  [38] 1 1 1 2 4 2 2 2 1 2 3 2 2 1 1 1 1 3 3 2 2 3 2 2 2 1 3 3 1 1 3 1 3 2 2 1 4
##  [75] 4 3 2 2 2 1 3 2 2 2 4 2 1 2 3 3 3 1 2 3 1 2 1 2 1 2 2 3 2 2 2 2 2 3 3 1 3
## [112] 3 3 1 2 1 2 2 1 2 2 3 1 1 1 2 3 2 1 3 1 1 2 3 1 2 2 2 3 1 2 3 3 2 2 3 2 2
## [149] 3 3 3 2 2 1 3 3 2 2 3 1 4 4 2 1 4 3 1 4 1 2 2 2 1 3 3 3 2 2 1 2 2 2 2 3 2
## [186] 2 3 4 3 2 3 1 2 3 3 2 1 3 3 1 2 3 2 3 2 2 3 3 2 3 2 2 2 1 1 3 1 2 2 3 2 2
## [223] 2 2 3 4 3 2 3 3 1 3 3 3 2 2 2 2 1 1 3 2 2 3 1 2 1 2 3 3 2 1 4 1 2 3 2 3 3
## [260] 4 2 3 2 3 2 3 2 2 2 1 1 3 1 1 3 1 1 3 3 1 2 1 2 3 2 3 3 2 2 3 3 3 2 2 2 2
## [297] 3 3 3 3 3 3 3 3 1 3 3 2 2 4 4 2 2 2 2 3 1 3 3 3 3 3 3 3 3 3 4 4 3 2 3 3 3
## [334] 3 3 3 2 3 3 3 3 3 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 4 3
## [371] 3 3 3 2 2 4 3 3 3 3 3 3 3 3 4 3 1 3 3 2 2 2 3 3 3 3 2 2 3 3 3 3 3 3 3 2 2
## [408] 3 2 3 2 2 2 2 2 2 3 3 2 2 2 2 2 3 3 2 3 2 3 2 2 2 2 2 2 2 2 2 3 3 2 2 2 2
## [445] 3 2 3 3 3 1 3 2 4 2 1 2 3 3 2 2 2 1 2 1 3 1 3 3 3 3 2 2 3 3 2 2 3 3 2 2 3
## [482] 3 4 4 3 3 3 1 2 1 3 2 2 3 3 2 1 3 3 1 3 2 1 2 3 1 2 2 1 2 1 2 1 3 1 2 3 2
## [519] 2 3 2 1 2 3 2 3 2 3 3 3 2 2 3 3 3 3 3 3 2 2 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3
## [556] 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3
## [593] 3 3 3 3 3 3 2 2 1 1 3 3 3 3 2
df <- data.frame(df, le_test)
# View(df)

df$experience_level <- NULL
colnames(df) <- c("work_year", "employment_type", "job_title",
                  "salary", "salary_currency", "salary_in_usd",
                  "employee_residence", "remote_ratio", 
                  "company_location", "company_size",
                  "experience_level")

le_company_size <- function(features){
  value=0
  if(features == "Small"){
    value = 1
  }else if(features == "Medium"){
    value = 2
  }else if(features == "Large"){
    value = 3
  }
  
  return(value)
  
}

le_test <- sapply(df$company_size, le_company_size)
le_test
## DE JP GB HN US US US HU US NZ FR IN FR US US PK JP GB IN US CN IN GR US AE US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## NL MX US CA DE US US US FR AT US US NG US US ES PT US GB DE GB US US FR IN US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## DK DE US DE ES US US US US US IT US HR DE DE US US AT LU FR GB US US FR US IN 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US DE US CA ES PL PL FR US US US DK DE IN US IN SG US US US US US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US GB CA US US IN DE GB GB US NL US NG GR US US RO US US GB ES US US IN IN IN 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## IQ FR US BR US US JP JP US US US US US US US BE FR US US US JP US CA UA US CA 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## IL US US US RU RU MT DE DE US US US US US US GB US US PT US MX CL US US IN DE 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US GB PK IR GB ES GB JP FR CO MD US CA US KE IN US US AT US ES US US BR US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US SI FR GB CH DE US DK DE BE US PL GB IN GB CA US CA DE US CA IN US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US CA ES VN IN CA US US US AS GB FR TR GB US US US US IN US CA US DE US DE 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US DE IN IN DE US US US TR DE US BR DE DE FR US US ES TR LU US CN NL US CZ IN 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## SI US IT US US US US US US US US US US US GB GB US US GB US US US US US US GB 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## GB GB GB US GB US US US US US US US US US US US US US US US US US US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US US US US US US US US US US US GB GB US US CA CA CA CA CA CA 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US US US US GR GR CA US US US US US US US US IN US GB US US GB 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## GB US US US US US DE GB US US US US US US US GB US US GB US GB GB GR GR GB GB 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US MX US US US US US US US US US ES US GB ES ES ES ES GB GB ES GR US US GR GR 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## GB GB US GR US US US PT US CA CA US US US US DE IN IN PT US DE IN DE US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US GB GB US US US US AE AE US US US US US DZ US CZ US CA PL CA 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US DE US FR CA NL EE MY AU US AU US AT US US AU CA US IE PK US US FR CH US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## CA LU GR US US US US US US US CA CA US US US US US US US US US US US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US US US US US US US US US US US US US US US US GB US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US US US US US US US US US GB US US US US US US US US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US CA CA US US US US US 
##  0  0  0  0  0  0  0  0  0
df <- data.frame(df, le_test)


df$company_size <- NULL

le_remote <- function(features){
  value=0
  if(features == "No Remote Work"){
    value = 0
  }else if(features == "Partially Remote"){
    value = 50
  }else if(features == "Fully Remote"){
    value = 100
  }
  
  return(value)
  
}

le_test <- sapply(df$remote_ratio, le_remote)
le_test
## DE JP GB HN US US US HU US NZ FR IN FR US US PK JP PL IN PT CN IN GR US AE US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## NL MX US CA DE US GR US FR AT US US NG US PH ES PT US GB DE GB US US FR IN US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## DK PK RU DE ES US US US US ES IT US HR DE DE US US AT FR FR GB IN US FR US IN 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US DE US GB ES IT PL FR US US BG GR DE IN US IN SG BR US US DE US US HU US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US GB CA US US IN DE GB GB PK NL US NG GR US US ES US US GB ES US US IN IN IN 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## IQ FR VN BR US US JP JP US US US US US US US BE FR US US BR JP US CA UA US CA 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## SG US US US RU RU MT DE DE US US US US US US GB US US PT US MX CL US IN IN DE 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## RO GB PK IR VN FR RO JP US CO MD US CA US KE IN IN US DE US ES FR US BR US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## IT US SI FR HK IN DE US DK DE BE US PL GB IN GB US US CA DE US CA IN US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US CA ES VN IN CA US US US IN GB FR TR GB US US US US IN US CA US DE US DE 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US DE IN IN RS PR US US TR DE US BR DE NL FR US US ES TR LU US JE NL US CZ IN 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## SI US IT US US US US US US US US US US US GB GB US US GB US US US US US US GB 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## GB GB GB US GB US US US US US US US US US US US US US US US US US US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US US US US US US US US US US US GB GB US US CA CA CA CA CA CA 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US US US US GR GR CA US US US US US US US US IN US GB US US GB 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## GB US US US US US FR GB US US US US US US US GB US US GB US GB GB GR GR GB GB 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US AR US US US US US US US US US ES US GB ES ES ES ES GB GB ES GR US US GR GR 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## GB GB US GR US US US PT US CA CA US US US US DE IN IN PT US DE IN DE US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US GB GB US US US US AE AE US US US US US DZ CA TN MY CA PL CA 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## BR US PK US FR CA NL EE JP AU US AU BO AT US US AU CA US IE PK RU US FR CH US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## CA PT GR US US US US US US US CA CA US US US US US US US US US US US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US US US US US US US US US US US US US US US US GB US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US US US US US US US US US US US US US GB US US US US US US US US US US US 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
## US US CA CA US US US US IN 
##  0  0  0  0  0  0  0  0  0
df <- data.frame(df, le_test)


df$remote_ratio <- NULL


# colnames(df) <- c("work_year", "employment_type", "job_title",
#                   "salary", "salary_currency", "salary_in_usd",
#                   "employee_residence", 
#                   "company_location",
#                   "experience_level", "company_size",
#                   "remote_ratio")

3.3 One Hot Encoding

df_OHE <- dummyVars("~.", 
                    data = df,
                    fullrank = T)

df_new <- data.frame(predict(df_OHE,
                             newdata = df))

Viewing the preprocessed data

View(head(df_new))

3.4 Training and Testing Split

This is the best proportion that I tried (93% Training, 7% Testing)

# set.seed(2000)
# 
# row = dim(df_new)[1]
# 
# train_idx = sample(row, 0.93*row)
# 
# training_data <- df_new[train_idx,]
# testing_data <- df_new[-train_idx,]
# actual <- testing_data$salary_in_usd

4. Modeling

Creating several regression models based on the data

4.1 Simple Linear Regression

# slr <- lm(formula = salary_in_usd~experience_level,
#           data = training_data)
# 
# pred.slr <- predict(slr, testing_data)

4.2 Multiple Linear Regression

# mlr <- lm(formula = salary_in_usd~.,
#           data = training_data)
# 
# pred.mlr <- predict(mlr, testing_data)

4.3 Polynomial Regression

# poly <- lm(formula = salary_in_usd~. + I(company_size^2) + 
#              I(experience_level^3),
#            data = training_data)
# 
# pred.poly <-predict(poly, testing_data)

5. Performance Evaluations

5.1 Creating the function for Performance

performance <- function(prediction, actual, method){
  error <- prediction - actual
  squared_error <- error^2
  sum_squared_error <- sum(squared_error)
  mean_squared_error <- mean(squared_error)
  root_mean_squared_error <- sqrt(mean_squared_error)
  
  r <- cor(prediction, actual)
  
  result <- paste("Method:", method,
                  "\nSum Squared Error:", round(sum_squared_error,2),
                  "\nMean Squared Error:", round(mean_squared_error,2),
                  "\nRoot Mean Squared Error:", round(root_mean_squared_error,2),
                  "\nCorrelation:", round(r,2))
  
  cat(result)
}

5.2 Performance Evaluations

# performance(pred.slr, actual, "Simple Linear Regression")
# performance(pred.mlr, actual, "Multiple Linear Regression")
# performance(pred.poly, actual, "Polynomial Regression")

From here we know that (from this data):

-Simple Linear Regression has the lowest correlation (0.55) and the highest errors

-Multiple Linear Regression has the highest correlation (0.71) and the least errors

-Polynomial Regression has the second highest correlation (0.71) and the second least errors

Thank you for reading and using this code