This report provides an analysis of salaries of data science related jobs using various regression models. The data for this research is based from kaggle and here is the link: click here
The project is structured as follows :
Data Understanding
Data Visualisation and Exploratory Data Analysis
Data Preprocessing
Modeling
Performance Evaluations
rm(list=ls())
library(ggplot2)
library(plyr)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
Creating the base dataframe by converting the data
df <- read.csv("ds_salaries.csv")
The dataset contains 607 rows and 12 columns. The target variable is Salary_in_USD and the remains are the features.
Finding out about some of the data
View(head(df))
Finding out the whole data
View(df)
Finding out more with data summarization
View(summary(df))
Finding the one with the highest salary_in_usd
match(600000, df$salary_in_usd)
## [1] 253
Convertion of the raw data to grasp a better understanding and factoring the categorical datas
df$remote_ratio <- factor(df$remote_ratio, levels = c(0,50,100),
labels = c("No Remote Work", "Partially Remote", "Fully Remote"))
df$experience_level <- factor(df$experience_level, levels = c("EN", "MI", "SE", "EX"),
labels = c("Entry Level", "Junior Mid", "Senior", "Executive"))
df$employment_type <- factor(df$employment_type, levels = c("PT", "CT", "FL", "FT"),
labels = c("Part-time", "Contract", "Freelance", "Full-time"))
df$company_size <- factor(df$company_size,
levels = c("S", "M", "L"),
labels = c("Small", "Medium", "Large"))
df$work_year <- factor(df$work_year)
Visualization of salary_in_usd with Boxplot
ggplot(df,
aes(y = salary_in_usd)) + geom_boxplot() +
scale_y_continuous(breaks = c(0, 100000, 200000, 300000,
400000, 500000, 600000),
labels = c("0", "100000", "200000",
"300000", "400000",
"500000", "600000"),
limits= c(0, 600000))
Visualization of salary_in_usd and job_title with Histogram
ggplot(df,
aes(y = salary_in_usd,
fill = job_title,
group = job_title
)) + geom_histogram(bins = 25) +
scale_y_continuous(limits=c(0,1000000)) +
labs(title = "Positions Matter",y = "Salary in USD")
Visualization of salary_in_usd and experience_level with Histogram
ggplot(df,
aes(y = salary_in_usd,
fill = experience_level,
group = experience_level
)) + geom_histogram(bins = 20) +
scale_y_continuous(limits=c(0,1000000)) +
labs(y = "Salary in USD") +
labs(title = "Experience Matters",y = "Salary in USD")
Visualization of salary_in_usd and company_size with Histogram
ggplot(df,
aes(y = salary_in_usd,
fill = company_size,
group = company_size,
)) + geom_histogram(bins = 20) +
scale_y_continuous(limits=c(0,1000000)) +
labs(y = "Salary in USD") +
labs(title = "Workplace Matters",y = "Salary in USD")
Visualization of salary_in_usd and remote_ratio with Scatter Plot
ggplot(data = df,
aes(y = salary_in_usd,
x = remote_ratio
)) + geom_point(color = "cyan",
position = "jitter") +
scale_y_continuous(limits=c(0,1000000)) +
labs(y = "Salary in USD") +
labs(title = "Work From Home Salaries",y = "Salary in USD")
Visualization of salary_in_usd and employment_type with Density Plot
ggplot(data = df,
aes(y = salary_in_usd,
fill = employment_type,
group = employment_type
)) + geom_density(color = "yellow" ) +
scale_y_continuous(limits=c(0,1000000)) +
labs(y = "Salary in USD") +
labs(title = "Working Hours Matter",y = "Salary in USD")
Visualization of salary_in_usd and company_location with Histogram
ggplot(data = df,
aes(x = salary_in_usd,
fill = company_location,
group = company_location
)) + geom_histogram(bins=30) +
scale_x_continuous(limits=c(0,1000000)) +
labs(title = "Working Locations",x = "Salary in USD")
ggplot(data = df,
aes(y = salary_in_usd)) + geom_boxplot()
boxplot.stats(df$salary_in_usd)
## $stats
## [1] 2859 62726 101570 150000 276000
##
## $n
## [1] 607
##
## $conf
## [1] 95973.1 107166.9
##
## $out
## [1] 325000 450000 412000 450000 423000 416000 600000 324000 380000 405000
outlier <- boxplot.stats(df$salary_in_usd)$out
outlier_idx <- which(df$salary_in_usd %in% c(outlier))
df <- df[-outlier_idx]
le_experience <- function(features){
value=0
if(features == "Entry Level"){
value = 1
}else if(features == "Junior Mid"){
value = 2
}else if(features == "Senior"){
value = 3
}else{
value = 4
}
return(value)
}
le_test <- sapply(df$experience_level, le_experience)
le_test
## [1] 2 3 3 2 3 1 3 2 2 3 1 2 1 2 2 2 1 3 1 2 2 2 3 2 2 4 1 3 1 3 2 1 3 2 2 2 2
## [38] 1 1 1 2 4 2 2 2 1 2 3 2 2 1 1 1 1 3 3 2 2 3 2 2 2 1 3 3 1 1 3 1 3 2 2 1 4
## [75] 4 3 2 2 2 1 3 2 2 2 4 2 1 2 3 3 3 1 2 3 1 2 1 2 1 2 2 3 2 2 2 2 2 3 3 1 3
## [112] 3 3 1 2 1 2 2 1 2 2 3 1 1 1 2 3 2 1 3 1 1 2 3 1 2 2 2 3 1 2 3 3 2 2 3 2 2
## [149] 3 3 3 2 2 1 3 3 2 2 3 1 4 4 2 1 4 3 1 4 1 2 2 2 1 3 3 3 2 2 1 2 2 2 2 3 2
## [186] 2 3 4 3 2 3 1 2 3 3 2 1 3 3 1 2 3 2 3 2 2 3 3 2 3 2 2 2 1 1 3 1 2 2 3 2 2
## [223] 2 2 3 4 3 2 3 3 1 3 3 3 2 2 2 2 1 1 3 2 2 3 1 2 1 2 3 3 2 1 4 1 2 3 2 3 3
## [260] 4 2 3 2 3 2 3 2 2 2 1 1 3 1 1 3 1 1 3 3 1 2 1 2 3 2 3 3 2 2 3 3 3 2 2 2 2
## [297] 3 3 3 3 3 3 3 3 1 3 3 2 2 4 4 2 2 2 2 3 1 3 3 3 3 3 3 3 3 3 4 4 3 2 3 3 3
## [334] 3 3 3 2 3 3 3 3 3 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 4 3
## [371] 3 3 3 2 2 4 3 3 3 3 3 3 3 3 4 3 1 3 3 2 2 2 3 3 3 3 2 2 3 3 3 3 3 3 3 2 2
## [408] 3 2 3 2 2 2 2 2 2 3 3 2 2 2 2 2 3 3 2 3 2 3 2 2 2 2 2 2 2 2 2 3 3 2 2 2 2
## [445] 3 2 3 3 3 1 3 2 4 2 1 2 3 3 2 2 2 1 2 1 3 1 3 3 3 3 2 2 3 3 2 2 3 3 2 2 3
## [482] 3 4 4 3 3 3 1 2 1 3 2 2 3 3 2 1 3 3 1 3 2 1 2 3 1 2 2 1 2 1 2 1 3 1 2 3 2
## [519] 2 3 2 1 2 3 2 3 2 3 3 3 2 2 3 3 3 3 3 3 2 2 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3
## [556] 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3
## [593] 3 3 3 3 3 3 2 2 1 1 3 3 3 3 2
df <- data.frame(df, le_test)
# View(df)
df$experience_level <- NULL
colnames(df) <- c("work_year", "employment_type", "job_title",
"salary", "salary_currency", "salary_in_usd",
"employee_residence", "remote_ratio",
"company_location", "company_size",
"experience_level")
le_company_size <- function(features){
value=0
if(features == "Small"){
value = 1
}else if(features == "Medium"){
value = 2
}else if(features == "Large"){
value = 3
}
return(value)
}
le_test <- sapply(df$company_size, le_company_size)
le_test
## DE JP GB HN US US US HU US NZ FR IN FR US US PK JP GB IN US CN IN GR US AE US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## NL MX US CA DE US US US FR AT US US NG US US ES PT US GB DE GB US US FR IN US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## DK DE US DE ES US US US US US IT US HR DE DE US US AT LU FR GB US US FR US IN
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US DE US CA ES PL PL FR US US US DK DE IN US IN SG US US US US US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US GB CA US US IN DE GB GB US NL US NG GR US US RO US US GB ES US US IN IN IN
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## IQ FR US BR US US JP JP US US US US US US US BE FR US US US JP US CA UA US CA
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## IL US US US RU RU MT DE DE US US US US US US GB US US PT US MX CL US US IN DE
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US GB PK IR GB ES GB JP FR CO MD US CA US KE IN US US AT US ES US US BR US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US SI FR GB CH DE US DK DE BE US PL GB IN GB CA US CA DE US CA IN US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US CA ES VN IN CA US US US AS GB FR TR GB US US US US IN US CA US DE US DE
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US DE IN IN DE US US US TR DE US BR DE DE FR US US ES TR LU US CN NL US CZ IN
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## SI US IT US US US US US US US US US US US GB GB US US GB US US US US US US GB
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GB GB GB US GB US US US US US US US US US US US US US US US US US US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US US US US US US US US US US US GB GB US US CA CA CA CA CA CA
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US US US US GR GR CA US US US US US US US US IN US GB US US GB
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GB US US US US US DE GB US US US US US US US GB US US GB US GB GB GR GR GB GB
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US MX US US US US US US US US US ES US GB ES ES ES ES GB GB ES GR US US GR GR
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GB GB US GR US US US PT US CA CA US US US US DE IN IN PT US DE IN DE US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US GB GB US US US US AE AE US US US US US DZ US CZ US CA PL CA
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US DE US FR CA NL EE MY AU US AU US AT US US AU CA US IE PK US US FR CH US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## CA LU GR US US US US US US US CA CA US US US US US US US US US US US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US US US US US US US US US US US US US US US US GB US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US US US US US US US US US GB US US US US US US US US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US CA CA US US US US US
## 0 0 0 0 0 0 0 0 0
df <- data.frame(df, le_test)
df$company_size <- NULL
le_remote <- function(features){
value=0
if(features == "No Remote Work"){
value = 0
}else if(features == "Partially Remote"){
value = 50
}else if(features == "Fully Remote"){
value = 100
}
return(value)
}
le_test <- sapply(df$remote_ratio, le_remote)
le_test
## DE JP GB HN US US US HU US NZ FR IN FR US US PK JP PL IN PT CN IN GR US AE US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## NL MX US CA DE US GR US FR AT US US NG US PH ES PT US GB DE GB US US FR IN US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## DK PK RU DE ES US US US US ES IT US HR DE DE US US AT FR FR GB IN US FR US IN
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US DE US GB ES IT PL FR US US BG GR DE IN US IN SG BR US US DE US US HU US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US GB CA US US IN DE GB GB PK NL US NG GR US US ES US US GB ES US US IN IN IN
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## IQ FR VN BR US US JP JP US US US US US US US BE FR US US BR JP US CA UA US CA
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## SG US US US RU RU MT DE DE US US US US US US GB US US PT US MX CL US IN IN DE
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## RO GB PK IR VN FR RO JP US CO MD US CA US KE IN IN US DE US ES FR US BR US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## IT US SI FR HK IN DE US DK DE BE US PL GB IN GB US US CA DE US CA IN US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US CA ES VN IN CA US US US IN GB FR TR GB US US US US IN US CA US DE US DE
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US DE IN IN RS PR US US TR DE US BR DE NL FR US US ES TR LU US JE NL US CZ IN
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## SI US IT US US US US US US US US US US US GB GB US US GB US US US US US US GB
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GB GB GB US GB US US US US US US US US US US US US US US US US US US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US US US US US US US US US US US GB GB US US CA CA CA CA CA CA
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US US US US GR GR CA US US US US US US US US IN US GB US US GB
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GB US US US US US FR GB US US US US US US US GB US US GB US GB GB GR GR GB GB
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US AR US US US US US US US US US ES US GB ES ES ES ES GB GB ES GR US US GR GR
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GB GB US GR US US US PT US CA CA US US US US DE IN IN PT US DE IN DE US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US GB GB US US US US AE AE US US US US US DZ CA TN MY CA PL CA
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## BR US PK US FR CA NL EE JP AU US AU BO AT US US AU CA US IE PK RU US FR CH US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## CA PT GR US US US US US US US CA CA US US US US US US US US US US US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US US US US US US US US US US US US US US US US GB US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US US US US US US US US US US US US US GB US US US US US US US US US US US
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## US US CA CA US US US US IN
## 0 0 0 0 0 0 0 0 0
df <- data.frame(df, le_test)
df$remote_ratio <- NULL
# colnames(df) <- c("work_year", "employment_type", "job_title",
# "salary", "salary_currency", "salary_in_usd",
# "employee_residence",
# "company_location",
# "experience_level", "company_size",
# "remote_ratio")
df_OHE <- dummyVars("~.",
data = df,
fullrank = T)
df_new <- data.frame(predict(df_OHE,
newdata = df))
Viewing the preprocessed data
View(head(df_new))
This is the best proportion that I tried (93% Training, 7% Testing)
# set.seed(2000)
#
# row = dim(df_new)[1]
#
# train_idx = sample(row, 0.93*row)
#
# training_data <- df_new[train_idx,]
# testing_data <- df_new[-train_idx,]
# actual <- testing_data$salary_in_usd
Creating several regression models based on the data
# slr <- lm(formula = salary_in_usd~experience_level,
# data = training_data)
#
# pred.slr <- predict(slr, testing_data)
# mlr <- lm(formula = salary_in_usd~.,
# data = training_data)
#
# pred.mlr <- predict(mlr, testing_data)
# poly <- lm(formula = salary_in_usd~. + I(company_size^2) +
# I(experience_level^3),
# data = training_data)
#
# pred.poly <-predict(poly, testing_data)
performance <- function(prediction, actual, method){
error <- prediction - actual
squared_error <- error^2
sum_squared_error <- sum(squared_error)
mean_squared_error <- mean(squared_error)
root_mean_squared_error <- sqrt(mean_squared_error)
r <- cor(prediction, actual)
result <- paste("Method:", method,
"\nSum Squared Error:", round(sum_squared_error,2),
"\nMean Squared Error:", round(mean_squared_error,2),
"\nRoot Mean Squared Error:", round(root_mean_squared_error,2),
"\nCorrelation:", round(r,2))
cat(result)
}
# performance(pred.slr, actual, "Simple Linear Regression")
# performance(pred.mlr, actual, "Multiple Linear Regression")
# performance(pred.poly, actual, "Polynomial Regression")
From here we know that (from this data):
-Simple Linear Regression has the lowest correlation (0.55) and the highest errors
-Multiple Linear Regression has the highest correlation (0.71) and the least errors
-Polynomial Regression has the second highest correlation (0.71) and the second least errors