Weekly Challenge 2

Import the data

# Load necessary libraries
library(readr)
library(dplyr)
library(psych)

# Read the salary CSV into a tibble and preview its first six rows
data <- readr::read_csv(file = "ds_salaries.csv")
head(data, n = 6L)

## # A tibble: 6 × 11
##   work_year experience_level employment_type job_title    salary salary_currency
##       <dbl> <chr>            <chr>           <chr>         <dbl> <chr>          
## 1      2023 SE               FT              Principal D…  80000 EUR            
## 2      2023 MI               CT              ML Engineer   30000 USD            
## 3      2023 MI               CT              ML Engineer   25500 USD            
## 4      2023 SE               FT              Data Scient… 175000 USD            
## 5      2023 SE               FT              Data Scient… 120000 USD            
## 6      2023 SE               FT              Applied Sci… 222200 USD            
## # ℹ 5 more variables: salary_in_usd <dbl>, employee_residence <chr>,
## #   remote_ratio <dbl>, company_location <chr>, company_size <chr>

Overview of the Data

# Per-column summary: quartiles for numeric columns, length/class for text
summary(object = data)

##    work_year    experience_level   employment_type     job_title        
##  Min.   :2020   Length:3755        Length:3755        Length:3755       
##  1st Qu.:2022   Class :character   Class :character   Class :character  
##  Median :2022   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2022                                                           
##  3rd Qu.:2023                                                           
##  Max.   :2023                                                           
##      salary         salary_currency    salary_in_usd    employee_residence
##  Min.   :    6000   Length:3755        Min.   :  5132   Length:3755       
##  1st Qu.:  100000   Class :character   1st Qu.: 95000   Class :character  
##  Median :  138000   Mode  :character   Median :135000   Mode  :character  
##  Mean   :  190696                      Mean   :137570                     
##  3rd Qu.:  180000                      3rd Qu.:175000                     
##  Max.   :30400000                      Max.   :450000                     
##   remote_ratio    company_location   company_size      
##  Min.   :  0.00   Length:3755        Length:3755       
##  1st Qu.:  0.00   Class :character   Class :character  
##  Median :  0.00   Mode  :character   Mode  :character  
##  Mean   : 46.27                                        
##  3rd Qu.:100.00                                        
##  Max.   :100.00

Country with the Most Data Science Jobs

# Convert the company location codes to a factor (one level per country code)
data$company_location <- as.factor(data$company_location)
# Tally rows per company location, most frequent first. The column is given by
# its bare name (dplyr data masking) rather than `data$company_location`, so
# the result column is named `company_location` instead of being named after
# the literal expression.
count(data, company_location, sort = TRUE)

## # A tibble: 72 × 2
##    company_location     n
##    <fct>            <int>
##  1 US                3040
##  2 GB                 172
##  3 CA                  87
##  4 ES                  77
##  5 IN                  58
##  6 DE                  56
##  7 FR                  34
##  8 BR                  15
##  9 AU                  14
## 10 GR                  14
## # ℹ 62 more rows

Create a New DF for US

# Keep only the rows whose company is located in the US, then report the
# resulting number of rows and columns
usdata <- dplyr::filter(data, company_location == "US")
dim(usdata)

## [1] 3040   11

Central Tendency and Dispersion for US

# Arithmetic mean of the USD salary over the US subset
mean(usdata[["salary_in_usd"]])

## [1] 151822

# Median of the USD salary over the US subset
median(usdata[["salary_in_usd"]])

## [1] 145000

# Minimum and maximum USD salary in the US subset
range(usdata[["salary_in_usd"]])

## [1]   5679 450000

# Sample variance of the USD salary in the US subset
var(usdata[["salary_in_usd"]])

## [1] 3138076709

# Sample standard deviation of the USD salary in the US subset
sd(usdata[["salary_in_usd"]])

## [1] 56018.54

Summary Statistics by Experience Level

# Descriptive statistics of USD salary for each experience level, returned as
# a single table (mat = TRUE) with one row per group
psych::describeBy(
  usdata[["salary_in_usd"]],
  group = usdata[["experience_level"]],
  mat = TRUE
)

##     item group1 vars    n     mean       sd median   trimmed      mad    min
## X11    1     EN    1  183 102400.6 45850.50  92700  99471.39 47887.98  12000
## X12    2     EX    1   98 207445.5 65360.84 200000 204225.71 65234.40 100000
## X13    3     MI    1  497 127822.5 50032.38 124000 124431.64 38547.60   5679
## X14    4     SE    1 2262 158683.5 53016.36 150000 155255.79 50408.40  25000
##        max  range      skew    kurtosis       se
## X11 250000 238000 0.6227810  0.09700679 3389.366
## X12 416000 316000 0.5489788 -0.11319474 6602.442
## X13 450000 444321 1.4045630  5.95811373 2244.259
## X14 412000 387000 0.7649736  1.15075919 1114.714

Weekly Challenge 2

Mingun Kim

2024-09-08

Import the data

Overview of the Data

Country with the Most Data Science Jobs

Create a New DF for US

Central Tendency and Dispersion for US

Summary Statistics by Experience Level