R派模思
2022-04-10
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: sysfonts
## Loading required package: showtextdb
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Directory that holds the saved HTML files. The trailing slash is required
# because file names are appended by plain string concatenation below.
path <- "~/PycharmProjects/WebScraping/Project001/recruitment_engineer/recruitment_engineer/spiders/"

# Build the paths of the files numbered 1 to 70 in a single vectorized call:
# paste0() recycles the scalar pieces across 1:70, so no loop and no
# incremental c() growth of the result vector is needed.
files <- paste0(path, "前程_电子工程师招聘_", 1:70, ".html")

# Spot-check one of the generated paths.
print(files[3])
## [1] "~/PycharmProjects/WebScraping/Project001/recruitment_engineer/recruitment_engineer/spiders/前程_电子工程师招聘_3.html"
## Rows: 50
## Columns: 20
## $ jobid <chr> "139367258", "138715966", "130759689", "133580078",…
## $ coid <chr> "2128929", "2244822", "5968843", "2918382", "656702…
## $ job_href <chr> "https://jobs.51job.com/dongguan-hjz/139367258.html…
## $ job_name <chr> "电子元器件资深工程师", "电子工程师(智能门锁)", "…
## $ job_title <chr> "电子元器件资深工程师", "电子工程师(智能门锁)", "…
## $ company_href <chr> "https://jobs.51job.com/all/co2128929.html", "https…
## $ company_name <chr> "东莞创机电业制品有限公司", "公牛集团股份有限公司",…
## $ providesalary_text <chr> "1.6-2.5万/月", "1.5-2万/月", "1-2万/月", "0.8-1万/…
## $ workarea <chr> "030830", "080307", "041000", "030200", "030829", "…
## $ workarea_text <chr> "东莞-厚街镇", "宁波-慈溪市", "深圳", "广州", "东莞…
## $ updatedate <chr> "04-07", "04-07", "04-07", "04-07", "04-07", "04-07…
## $ companytype_text <chr> "外资(非欧美)", "上市公司", "民营公司", "民营公司…
## $ degreefrom <chr> "6", "6", "5", "5", "5", "6", "6", "5", "5", "6", "…
## $ workyear <chr> "7", "6", "4", "5", "4", "5", "5", "6", "4", "6", "…
## $ issuedate <chr> "2022-04-07 17:11:24", "2022-04-07 19:57:09", "2022…
## $ jobwelf <chr> "五险一金 餐饮补贴 专业培训 年终奖金 定期体检", "带…
## $ companysize_text <chr> "10000人以上", "10000人以上", "1000-5000人", "1000-…
## $ companyind_text <chr> "机械/设备/重工", "电子技术/半导体/集成电路", "家具…
## $ jobwelf_list1 <chr> "五险一金, 餐饮补贴, 专业培训, 年终奖金, 定期体检",…
## $ attribute_text1 <chr> "东莞-厚街镇, 8-9年经验, 本科", "宁波-慈溪市, 5-7年…
## [1] "36 Invalid Page with 0B"
## [1] "本次取数3450条" "本次取数1条"
## Rows: 3,450
## Columns: 20
## $ jobid <chr> "139367258", "138715966", "130759689", "133580078",…
## $ coid <chr> "2128929", "2244822", "5968843", "2918382", "656702…
## $ job_href <chr> "https://jobs.51job.com/dongguan-hjz/139367258.html…
## $ job_name <chr> "电子元器件资深工程师", "电子工程师(智能门锁)", "…
## $ job_title <chr> "电子元器件资深工程师", "电子工程师(智能门锁)", "…
## $ company_href <chr> "https://jobs.51job.com/all/co2128929.html", "https…
## $ company_name <chr> "东莞创机电业制品有限公司", "公牛集团股份有限公司",…
## $ providesalary_text <chr> "1.6-2.5万/月", "1.5-2万/月", "1-2万/月", "0.8-1万/…
## $ workarea <chr> "030830", "080307", "041000", "030200", "030829", "…
## $ workarea_text <chr> "东莞-厚街镇", "宁波-慈溪市", "深圳", "广州", "东莞…
## $ updatedate <chr> "04-07", "04-07", "04-07", "04-07", "04-07", "04-07…
## $ companytype_text <chr> "外资(非欧美)", "上市公司", "民营公司", "民营公司…
## $ degreefrom <chr> "6", "6", "5", "5", "5", "6", "6", "5", "5", "6", "…
## $ workyear <chr> "7", "6", "4", "5", "4", "5", "5", "6", "4", "6", "…
## $ issuedate <chr> "2022-04-07 17:11:24", "2022-04-07 19:57:09", "2022…
## $ jobwelf <chr> "五险一金 餐饮补贴 专业培训 年终奖金 定期体检", "带…
## $ companysize_text <chr> "10000人以上", "10000人以上", "1000-5000人", "1000-…
## $ companyind_text <chr> "机械/设备/重工", "电子技术/半导体/集成电路", "家具…
## $ jobwelf_list1 <chr> "五险一金, 餐饮补贴, 专业培训, 年终奖金, 定期体检",…
## $ attribute_text1 <chr> "东莞-厚街镇, 8-9年经验, 本科", "宁波-慈溪市, 5-7年…
至此,我们就将HTML文件中的数据转化成了数据表格形式,关于数据清洗的内容将在下节展开。从HTML中提取数据时,要注意观察HTML文件中各元素所包含的数据,分清哪些是我们需要的;定位好元素后,就可以很快提取出数据。