学习R语言数据可视化就是要不断地重复,我今天要开始一个系列的练习,所有的数据都来自美国劳工统计局。

采集数据

涉及到简单的网页表格抓取。

library(RCurl)
## Loading required package: bitops
library(XML)
# Address of the BLS page that contains the paid-leave table.
web <- "https://www.bls.gov/opub/ted/2018/higher-wage-workers-more-likely-than-lower-wage-workers-to-have-paid-leave-benefits-in-2018.htm"
# Fetch the page source.
webcode <- getURL(web)
# Parse the downloaded text into an HTML document.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
webhtml <- htmlParse(webcode, asText = TRUE)
# Read the HTML tables: first column as text, the remaining three as numbers.
# Cells that cannot be converted to numbers become NA (with a warning).
tables <- readHTMLTable(
  webhtml,
  colClasses = c("character", "numeric", "numeric", "numeric")
)
## Warning in asMethod(object): 强制改变过程中产生了NA

## Warning in asMethod(object): 强制改变过程中产生了NA

## Warning in asMethod(object): 强制改变过程中产生了NA
# The value returned by readHTMLTable() here is a list, not a data frame.
class(tables)
## [1] "list"
# Keep only the first element of that list, which is the data frame we need.
tables <- tables[[1L]]
class(tables)
## [1] "data.frame"
tables
##        Wage category Paid sick leave Paid vacation Paid holidays
## 1        All workers              NA            NA            NA
## 2  Lowest 10 percent              31            41            40
## 3  Lowest 25 percent              45            52            54
## 4  Second 25 percent              73            83            82
## 5   Third 25 percent              83            90            91
## 6 Highest 25 percent              90            91            93
## 7 Highest 10 percent              92            92            93

整理数据

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
## 
##     complete
# Drop the first row ("All workers"), whose percentages were parsed as NA.
tables <- tables[-1, ]
# Reshape from wide to long: every column except the first is stacked into a
# (Type, Percent) pair, so each row holds one wage class / benefit combination.
tables <- gather(tables, key = "Type", value = "Percent", -1)
# The first column keeps its original header; give it a short name.
names(tables)[1] <- "Class"
tables
##                 Class            Type Percent
## 1   Lowest 10 percent Paid sick leave      31
## 2   Lowest 25 percent Paid sick leave      45
## 3   Second 25 percent Paid sick leave      73
## 4    Third 25 percent Paid sick leave      83
## 5  Highest 25 percent Paid sick leave      90
## 6  Highest 10 percent Paid sick leave      92
## 7   Lowest 10 percent   Paid vacation      41
## 8   Lowest 25 percent   Paid vacation      52
## 9   Second 25 percent   Paid vacation      83
## 10   Third 25 percent   Paid vacation      90
## 11 Highest 25 percent   Paid vacation      91
## 12 Highest 10 percent   Paid vacation      92
## 13  Lowest 10 percent   Paid holidays      40
## 14  Lowest 25 percent   Paid holidays      54
## 15  Second 25 percent   Paid holidays      82
## 16   Third 25 percent   Paid holidays      91
## 17 Highest 25 percent   Paid holidays      93
## 18 Highest 10 percent   Paid holidays      93

绘制图形

library(ggplot2)
library(RColorBrewer)
# Three colours from the "Set3" palette, one per benefit type.
mycolors <- brewer.pal(3, "Set3")
# Wage classes go on the x axis, ordered by their Percent values
# (reorder() uses the mean per class by default); bars are filled by type.
p <- ggplot(tables, aes(x = reorder(Class, Percent), y = Percent, fill = Type))
p +
  # geom_col() draws bar heights straight from the data; it is the idiomatic
  # equivalent of geom_bar(stat = "identity").
  geom_col(width = 0.7, position = position_dodge(0.8)) +
  xlab("") +
  ylab("") +
  scale_fill_manual(values = mycolors) +
  # No padding around the y range, so the bars sit directly on the x axis.
  scale_y_continuous(expand = c(0, 0), limits = c(0, 100)) +
  labs(
    subtitle = "Higher wage workers more likely than lower wage workers \n to have paid leave benefits in 2018",
    # Fixed typo in the title: "worders" -> "workers".
    title = "Percent of workers with access to paid benefits"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    # NOTE: ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
    # `size` is kept here so the code still runs on older ggplot2 versions.
    axis.line = element_line(color = "black", size = 1),
    axis.text.x = element_text(angle = 30)
  )