学习R语言数据可视化需要不断地重复练习。我今天要开始一个系列的练习,所有的数据都来自美国劳工统计局,
其中涉及到简单的网页表格抓取。
library(RCurl)
## Loading required package: bitops
library(XML)
# URL of the bls.gov article whose HTML table is scraped below.
web <- "https://www.bls.gov/opub/ted/2018/higher-wage-workers-more-likely-than-lower-wage-workers-to-have-paid-leave-benefits-in-2018.htm"
# Fetch the raw page source as a character string.
webcode <- getURL(web)
# Parse the downloaded text (not a file path) into an HTML document.
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
webhtml <- htmlParse(webcode, asText = TRUE)
# Read every <table> on the page: first column as text, the other three as
# numbers. Cells that cannot be converted to numeric become NA, which is what
# the warnings below report (see row 1 of the printed table).
tables <- readHTMLTable(
  webhtml,
  colClasses = c("character", "numeric", "numeric", "numeric")
)
## Warning in asMethod(object): NAs introduced by coercion
## Warning in asMethod(object): NAs introduced by coercion
## Warning in asMethod(object): NAs introduced by coercion
class(tables)
## [1] "list"
# readHTMLTable() returned a list here; keep only its first element.
tables <- tables[[1]]
class(tables)
## [1] "data.frame"
tables
## Wage category Paid sick leave Paid vacation Paid holidays
## 1 All workers NA NA NA
## 2 Lowest 10 percent 31 41 40
## 3 Lowest 25 percent 45 52 54
## 4 Second 25 percent 73 83 82
## 5 Third 25 percent 83 90 91
## 6 Highest 25 percent 90 91 93
## 7 Highest 10 percent 92 92 93
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
# Drop the first row ("All workers"), whose numeric columns are all NA.
tables <- tables[-1, ]
# Reshape wide -> long: every column except the first is stacked into a
# Type (former column name) / Percent (cell value) pair.
tables <- gather(tables, key = "Type", value = "Percent", -1)
# Only the first column still carries its original header; rename it.
names(tables)[1] <- "Class"
tables
## Class Type Percent
## 1 Lowest 10 percent Paid sick leave 31
## 2 Lowest 25 percent Paid sick leave 45
## 3 Second 25 percent Paid sick leave 73
## 4 Third 25 percent Paid sick leave 83
## 5 Highest 25 percent Paid sick leave 90
## 6 Highest 10 percent Paid sick leave 92
## 7 Lowest 10 percent Paid vacation 41
## 8 Lowest 25 percent Paid vacation 52
## 9 Second 25 percent Paid vacation 83
## 10 Third 25 percent Paid vacation 90
## 11 Highest 25 percent Paid vacation 91
## 12 Highest 10 percent Paid vacation 92
## 13 Lowest 10 percent Paid holidays 40
## 14 Lowest 25 percent Paid holidays 54
## 15 Second 25 percent Paid holidays 82
## 16 Third 25 percent Paid holidays 91
## 17 Highest 25 percent Paid holidays 93
## 18 Highest 10 percent Paid holidays 93
library(ggplot2)
library(RColorBrewer)
# Three fill colours, one per benefit type.
mycolors <- brewer.pal(3, "Set3")

# Base plot: wage classes on the x axis (ordered by their Percent values via
# reorder()), Percent on the y axis, bars filled by benefit type.
p <- ggplot(tables, aes(x = reorder(Class, Percent), y = Percent, fill = Type))

# Dodged bars drawn at their literal heights; the dodge width (0.8) is
# slightly larger than the bar width (0.7) so bars within a class do not touch.
p +
  geom_col(width = 0.7, position = position_dodge(0.8)) +
  scale_fill_manual(values = mycolors) +
  # Start the y axis exactly at 0 (no padding) and cap it at 100 percent.
  scale_y_continuous(expand = c(0, 0), limits = c(0, 100)) +
  labs(
    x = "",
    y = "",
    title = "Percent of workers with access to paid benefits",
    subtitle = "Higher wage workers more likely than lower wage workers \n to have paid leave benefits in 2018"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    # NOTE: ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
    # `size` is kept so the script still runs on older ggplot2 versions.
    axis.line = element_line(color = "black", size = 1),
    axis.text.x = element_text(angle = 30)
  )