R派模思
2022-04-08
本次市场薪酬调研小项目示例,如下说明:
目标岗位:电子工程师
研究方法:利用开源工具,爬取招聘网站数据,分析数据。
开源工具:Scrapy、RStudio
数据分析语言: R
取样数据:样本数据360条记录,有效355条记录
备注:薪酬调研数据来源,应利用好各大主流招聘网站数据,爬取数据量对HR来说,贵精不贵多,须评估数据质量。
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: sysfonts
## Loading required package: showtextdb
## # A tibble: 6 × 9
## company company_attribu… job joblink location recruiter_name recruiter_title
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 天邦达… 电子/芯片/半导… 高级… https:… 深圳-宝… 陈女士,游女士… 主管,综合管理,…
## 2 成都世… 电子/芯片/半导… 电子… https:… 成都,苏… 贺女士,徐女士… 人事经理,人力…
## 3 深圳市… 电子/芯片/半导… 电子… https:… 杭州-滨… 周先生,朱女士… 招聘主管,招聘…
## 4 山东英… 制药/生物工程,1… 数字… https:… 北京-海… 田女士,刘女士… 招聘专员,人力…
## 5 苏州医… 电子设备,1-49人… 电子… https:… 杭州-萧… 沈先生,沈女士… 人事主管,行政…
## 6 上海格… 制药/生物工程,… 电子… https:… 上海-浦… 王女士,王女士… 人事,招聘专员,…
## # … with 2 more variables: requirements <chr>, salary <chr>
抽取第一行数据
# Unpack the first row of `elec_engineer`. Each cell used here stores several
# postings joined by commas, so every field is split into a character vector.
# `[[row, col]]` extracts the cell as a plain string rather than passing a
# 1x1 tibble to str_split().
com <- elec_engineer[[1, 1]] %>% str_split(',') %>% unlist()
job <- elec_engineer[[1, 3]] %>% str_split(',') %>% unlist()
joblink <- elec_engineer[[1, 4]] %>% str_split(',') %>% unlist()
# The job id is the run of digits directly before ".shtml" in the link. The
# dot is escaped so it matches only a literal ".", and `+` requires at least
# one digit.
jobid <- str_extract(joblink, '[0-9]+\\.shtml') %>% parse_number()
location <- elec_engineer[[1, 5]] %>% str_split(',') %>% unlist()
recruiter_name <- elec_engineer[[1, 6]] %>% str_split(',') %>% unlist()
recruiter_title <- elec_engineer[[1, 7]] %>% str_split(',') %>% unlist()
salary <- elec_engineer[[1, 9]] %>% str_split(',') %>% unlist()
# Peek at the first few salary strings.
salary[1:5]
## [1] "15-25k" "15-30k" "15-35k·13薪" "15-35k·13薪" "15-30k"
查看数据长度是否一致,如有不一致,补齐空白使其一致。
## [1] "com" "job" "joblink" "jobid"
## [5] "location" "recruiter_name" "recruiter_title" "salary"
## [1] 40 40 40 40 40 40 32 40
## [1] "recruiter_title1" "40"
数据长度一致后,建立数据表
## # A tibble: 6 × 8
## job com location salary joblink jobid recruiter_name recruiter_title1
## <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 高级电子… 天邦… 深圳-宝… 15-25k https:… 1.91e9 陈女士 主管
## 2 电子工程… 浙江… 长沙-岳… 15-30k https:… 1.94e9 游女士 综合管理
## 3 电力电子… 光亚… 苏州-虎… 15-35… https:… 1.95e9 易先生 人力资源经理
## 4 电力电子… 光亚… 东莞-望… 15-35… https:… 1.95e9 易先生 人力资源经理
## 5 电力电子… 光亚… 东莞 15-30k https:… 1.95e9 陈女士 人资经理
## 6 电子工程… 红品… 深圳-宝… 10-15… https:… 1.93e9 朱女士 CEO
## [1] "2" "com_len" "40"
## [4] "job_len" "40" "joblink_len"
## [7] "40" "location_len" "40"
## [10] "recruitment_name_len" "40" "recruitment_title_len"
## [13] "37" "salary_len" "40"
## [1] "3" "com_len" "40"
## [4] "job_len" "40" "joblink_len"
## [7] "40" "location_len" "40"
## [10] "recruitment_name_len" "40" "recruitment_title_len"
## [13] "36" "salary_len" "40"
## [1] "4" "com_len" "40"
## [4] "job_len" "40" "joblink_len"
## [7] "40" "location_len" "40"
## [10] "recruitment_name_len" "40" "recruitment_title_len"
## [13] "29" "salary_len" "40"
## [1] "5" "com_len" "40"
## [4] "job_len" "40" "joblink_len"
## [7] "40" "location_len" "40"
## [10] "recruitment_name_len" "40" "recruitment_title_len"
## [13] "39" "salary_len" "40"
## [1] "6" "com_len" "40"
## [4] "job_len" "40" "joblink_len"
## [7] "40" "location_len" "40"
## [10] "recruitment_name_len" "40" "recruitment_title_len"
## [13] "38" "salary_len" "40"
## [1] "7" "com_len" "40"
## [4] "job_len" "40" "joblink_len"
## [7] "40" "location_len" "40"
## [10] "recruitment_name_len" "40" "recruitment_title_len"
## [13] "30" "salary_len" "40"
## [1] "8" "com_len" "40"
## [4] "job_len" "40" "joblink_len"
## [7] "40" "location_len" "40"
## [10] "recruitment_name_len" "40" "recruitment_title_len"
## [13] "33" "salary_len" "40"
## [1] "9" "com_len" "40"
## [4] "job_len" "40" "joblink_len"
## [7] "40" "location_len" "40"
## [10] "recruitment_name_len" "40" "recruitment_title_len"
## [13] "32" "salary_len" "40"
# Unpack rows 2-9 of `elec_engineer` the same way as the first row and append
# each row's postings to `df`.
# Every field except the recruiter titles splits into 40 values per row (see
# the length check output), so titles are padded up to that length.
expected_len <- 40
for (i in 2:9) {
  # `[[row, col]]` extracts the cell as a plain string rather than a 1x1 tibble.
  com <- elec_engineer[[i, 1]] %>% str_split(',') %>% unlist()
  job <- elec_engineer[[i, 3]] %>% str_split(',') %>% unlist()
  joblink <- elec_engineer[[i, 4]] %>% str_split(',') %>% unlist()
  # Digits directly before a literal ".shtml" form the job id.
  jobid <- str_extract(joblink, '[0-9]+\\.shtml') %>% parse_number()
  location <- elec_engineer[[i, 5]] %>% str_split(',') %>% unlist()
  recruiter_name <- elec_engineer[[i, 6]] %>% str_split(',') %>% unlist()
  recruiter_title <- elec_engineer[[i, 7]] %>% str_split(',') %>% unlist()
  salary <- elec_engineer[[i, 9]] %>% str_split(',') %>% unlist()
  # Rebuild the padded titles on every iteration. If this were skipped for a
  # row that already has all 40 titles, the previous row's titles would be
  # bound to this row's postings. max() keeps rep() from failing on a
  # negative count.
  # NOTE(review): padding is appended at the end, so titles may not line up
  # with their recruiters if the missing ones were not the last - confirm.
  gap <- max(expected_len - length(recruiter_title), 0)
  recruiter_title1 <- c(recruiter_title, rep('未提供', gap))
  df1 <- cbind(job, com, location, salary, joblink, jobid,
               recruiter_name, recruiter_title1)
  df <- rbind(df, df1)
}
df %>% glimpse()
## Rows: 360
## Columns: 8
## $ job <chr> "高级电子工程师", "电子工程师", "电力电子软件研发工程…
## $ com <chr> "天邦达科技", "浙江巨磁智能技术有限公司", "光亚智能",…
## $ location <chr> "深圳-宝安区", "长沙-岳麓区", "苏州-虎丘区", "东莞-望…
## $ salary <chr> "15-25k", "15-30k", "15-35k·13薪", "15-35k·13薪", "15…
## $ joblink <chr> "https://www.liepin.com/job/1914205188.shtml?d_sfrom=…
## $ jobid <chr> "1914205188", "1937969367", "1945326843", "1945193363…
## $ recruiter_name <chr> "陈女士", "游女士", "易先生", "易先生", "陈女士", "朱…
## $ recruiter_title1 <chr> "主管", "综合管理", "人力资源经理", "人力资源经理", "…
## Rows: 360
## Columns: 9
## $ job <chr> "高级电子工程师", "电子工程师", "电力电子软件研发工程…
## $ com <chr> "天邦达科技", "浙江巨磁智能技术有限公司", "光亚智能",…
## $ location <chr> "深圳-宝安区", "长沙-岳麓区", "苏州-虎丘区", "东莞-望…
## $ salary <chr> "15-25k", "15-30k", "15-35k·13薪", "15-35k·13薪", "15…
## $ joblink <chr> "https://www.liepin.com/job/1914205188.shtml?d_sfrom=…
## $ jobid <chr> "1914205188", "1937969367", "1945326843", "1945193363…
## $ recruiter_name <chr> "陈女士", "游女士", "易先生", "易先生", "陈女士", "朱…
## $ recruiter_title1 <chr> "主管", "综合管理", "人力资源经理", "人力资源经理", "…
## $ city <chr> "深圳", "长沙", "苏州", "东莞", "东莞", "深圳", "嘉兴…
# "面议" (negotiable) has no numbers; rewrite it as "0-0" so it can be split
# like every other salary range.
ind <- which(df$salary == '面议')
df$salary[ind] <- rep('0-0', length(ind))

# Drop anything after "·" (e.g. "13薪"), then split "min-max" at the dash.
salary_bounds <- df$salary %>%
  strsplit('·') %>%
  sapply('[[', 1) %>%
  str_split('-')
lower_part <- sapply(salary_bounds, '[[', 1)
upper_part <- sapply(salary_bounds, '[[', 2)

# The lower bound is bare digits; the upper bound carries the "k" suffix, so
# parse_number() is used for it. Both are scaled by 1000.
df$salaryMin <- as.numeric(lower_part) * 1000
df$salaryMax <- parse_number(upper_part) * 1000
df$salaryAvg <- (df$salaryMin + df$salaryMax) / 2
df %>% glimpse
## Rows: 360
## Columns: 12
## $ job <chr> "高级电子工程师", "电子工程师", "电力电子软件研发工程…
## $ com <chr> "天邦达科技", "浙江巨磁智能技术有限公司", "光亚智能",…
## $ location <chr> "深圳-宝安区", "长沙-岳麓区", "苏州-虎丘区", "东莞-望…
## $ salary <chr> "15-25k", "15-30k", "15-35k·13薪", "15-35k·13薪", "15…
## $ joblink <chr> "https://www.liepin.com/job/1914205188.shtml?d_sfrom=…
## $ jobid <chr> "1914205188", "1937969367", "1945326843", "1945193363…
## $ recruiter_name <chr> "陈女士", "游女士", "易先生", "易先生", "陈女士", "朱…
## $ recruiter_title1 <chr> "主管", "综合管理", "人力资源经理", "人力资源经理", "…
## $ city <chr> "深圳", "长沙", "苏州", "东莞", "东莞", "深圳", "嘉兴…
## $ salaryMin <dbl> 15000, 15000, 15000, 15000, 15000, 10000, 15000, 6000…
## $ salaryMax <dbl> 25000, 30000, 35000, 35000, 30000, 15000, 25000, 1200…
## $ salaryAvg <dbl> 20000, 22500, 25000, 25000, 22500, 12500, 20000, 9000…
# Plot posting count and mean monthly salary for the six cities with the most
# postings, counting each job id once.
is_repeat <- duplicated(df$jobid)
ind <- which(is_repeat)  # still assigned in case later code reads `ind`
# Filter with a logical mask: `df[-ind, ]` selects zero rows when `ind` is
# empty, i.e. when no job id is duplicated.
df[!is_repeat, ] %>%
  group_by(city) %>%
  summarize(n = n(), avg_salary = mean(salaryAvg)) %>%
  arrange(desc(n)) %>%
  head() %>%
  ggplot(aes(x = city)) +
  geom_col(aes(y = avg_salary)) +
  # Counts are drawn at n * 100 on the salary axis; the secondary axis divides
  # by 100 so they read back as counts.
  geom_point(aes(y = n * 100), color = 'red') +
  # NOTE(review): the count label sits at n * 10, not at the point's n * 100 -
  # confirm this offset is intended.
  geom_text(aes(label = n, x = city, y = n * 10), color = 'red') +
  geom_text(aes(label = round(avg_salary, 0), x = city, y = avg_salary + 1e+3),
            color = 'green') +
  labs(y = '平均月薪', x = '城市', title = '电子工程师招聘需求最大的前六城市') +
  scale_y_continuous(sec.axis = sec_axis(~ . / 100))
电子工程师招聘需求主要集中在江浙和广东地区,如上海,苏州,深圳;其中,该岗位在上海的平均月薪最高,可达19790元;在深圳为15575元,苏州16298元。
招聘要求多为大专或本科学历,3-5年工作经验。
用人行业多集中在电子/芯片/半导体,医疗器械,企业规模多以100-500人为主。