Import the data
# Load necessary libraries
library(readr)
library(dplyr)
library(psych)
# Load the salaries dataset (path is relative to the working directory).
data <- read_csv(file = "ds_salaries.csv")
# Preview the first six rows.
head(data, n = 6L)
## # A tibble: 6 × 11
## work_year experience_level employment_type job_title salary salary_currency
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 2023 SE FT Principal D… 80000 EUR
## 2 2023 MI CT ML Engineer 30000 USD
## 3 2023 MI CT ML Engineer 25500 USD
## 4 2023 SE FT Data Scient… 175000 USD
## 5 2023 SE FT Data Scient… 120000 USD
## 6 2023 SE FT Applied Sci… 222200 USD
## # ℹ 5 more variables: salary_in_usd <dbl>, employee_residence <chr>,
## # remote_ratio <dbl>, company_location <chr>, company_size <chr>
Overview of the Data
# Per-column overview: min/quartiles/mean/max for numeric columns and
# length/class/mode for character columns.
summary(data)
## work_year experience_level employment_type job_title
## Min. :2020 Length:3755 Length:3755 Length:3755
## 1st Qu.:2022 Class :character Class :character Class :character
## Median :2022 Mode :character Mode :character Mode :character
## Mean :2022
## 3rd Qu.:2023
## Max. :2023
## salary salary_currency salary_in_usd employee_residence
## Min. : 6000 Length:3755 Min. : 5132 Length:3755
## 1st Qu.: 100000 Class :character 1st Qu.: 95000 Class :character
## Median : 138000 Mode :character Median :135000 Mode :character
## Mean : 190696 Mean :137570
## 3rd Qu.: 180000 3rd Qu.:175000
## Max. :30400000 Max. :450000
## remote_ratio company_location company_size
## Min. : 0.00 Length:3755 Length:3755
## 1st Qu.: 0.00 Class :character Class :character
## Median : 0.00 Mode :character Mode :character
## Mean : 46.27
## 3rd Qu.:100.00
## Max. :100.00
Country with the Most Data Science Jobs
# Treat the company location as a categorical variable.
data$company_location <- as.factor(data$company_location)
# Tally rows per company location, most frequent first. The column is
# referenced by its bare name: dplyr verbs are data-masking, and writing
# `data$company_location` here would name the result column after that whole
# expression instead of `company_location`.
count(data, company_location, sort = TRUE)
## # A tibble: 72 × 2
## company_location n
## <fct> <int>
## 1 US 3040
## 2 GB 172
## 3 CA 87
## 4 ES 77
## 5 IN 58
## 6 DE 56
## 7 FR 34
## 8 BR 15
## 9 AU 14
## 10 GR 14
## # ℹ 62 more rows
Create a New Data Frame for the US
# Keep only the rows whose company is located in the US.
usdata <- dplyr::filter(data, company_location == "US")
# Report the number of rows and columns in the US subset.
dim(usdata)
## [1] 3040 11
Central Tendency and Dispersion for the US
# Mean of the USD salary column for the US subset.
mean(usdata[["salary_in_usd"]])
## [1] 151822
# Median of the same column.
median(usdata[["salary_in_usd"]])
## [1] 145000
# Smallest and largest salary.
range(usdata[["salary_in_usd"]])
## [1] 5679 450000
# Sample variance.
var(usdata[["salary_in_usd"]])
## [1] 3138076709
# Sample standard deviation.
sd(usdata[["salary_in_usd"]])
## [1] 56018.54
Summary Statistics by Experience Level
# Descriptive statistics of the US salaries, one row per experience level;
# mat = TRUE requests a single table instead of a list of per-group results.
describeBy(
  usdata[["salary_in_usd"]],
  group = usdata[["experience_level"]],
  mat = TRUE
)
## item group1 vars n mean sd median trimmed mad min
## X11 1 EN 1 183 102400.6 45850.50 92700 99471.39 47887.98 12000
## X12 2 EX 1 98 207445.5 65360.84 200000 204225.71 65234.40 100000
## X13 3 MI 1 497 127822.5 50032.38 124000 124431.64 38547.60 5679
## X14 4 SE 1 2262 158683.5 53016.36 150000 155255.79 50408.40 25000
## max range skew kurtosis se
## X11 250000 238000 0.6227810 0.09700679 3389.366
## X12 416000 316000 0.5489788 -0.11319474 6602.442
## X13 450000 444321 1.4045630 5.95811373 2244.259
## X14 412000 387000 0.7649736 1.15075919 1114.714