This scrapes occupational wage data from the Bureau of Labor Statistics (building on previous work from 607).
# Dependencies
suppressWarnings(library(curl, quietly = TRUE))
suppressWarnings(library(stringr, quietly = TRUE))
suppressWarnings(library(XML, quietly = TRUE))
# Downloading Data
# This section downloads a single page, but it can be modified to work across similar pages, since that only requires changing the occupation code in the URL. TODO: get a list of pertinent OES codes.
# From: https://www.bls.gov/oes/current/oes_stru.htm#15-0000
## List of URLS
# It was fastest to type these codes by hand
raw.data <- readLines("https://www.bls.gov/oes/current/oes151111.htm")
numbers.list <- c(1133, 1134, 1141, 1142, 1143, 1151, 1152, 1199, 2011, 2021, 2031, 2041, 2090)
urls <- c()
i <- 1
for (number in numbers.list){
  url <- paste(c("https://www.bls.gov/oes/current/oes15", number, ".htm"), collapse = "")
  urls[i] <- url
  i <- i + 1
}
urls
## [1] "https://www.bls.gov/oes/current/oes151133.htm"
## [2] "https://www.bls.gov/oes/current/oes151134.htm"
## [3] "https://www.bls.gov/oes/current/oes151141.htm"
## [4] "https://www.bls.gov/oes/current/oes151142.htm"
## [5] "https://www.bls.gov/oes/current/oes151143.htm"
## [6] "https://www.bls.gov/oes/current/oes151151.htm"
## [7] "https://www.bls.gov/oes/current/oes151152.htm"
## [8] "https://www.bls.gov/oes/current/oes151199.htm"
## [9] "https://www.bls.gov/oes/current/oes152011.htm"
## [10] "https://www.bls.gov/oes/current/oes152021.htm"
## [11] "https://www.bls.gov/oes/current/oes152031.htm"
## [12] "https://www.bls.gov/oes/current/oes152041.htm"
## [13] "https://www.bls.gov/oes/current/oes152090.htm"
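As an aside, paste0() is vectorized over its arguments, so the same vector of URLs can be built without an explicit loop. A minimal sketch (urls.alt is just an illustrative name):
# Vectorized alternative to the loop above; yields the same character vector
urls.alt <- paste0("https://www.bls.gov/oes/current/oes15", numbers.list, ".htm")
# identical(urls.alt, urls) should be TRUE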
# Since we're only interested in the first table, we'll cut everything else out.
# Find the lines between the opening and closing table tags.
first <- which(grepl("<table border=\"1\"", raw.data))[1]
last <- which(grepl("</table>", raw.data))[1]
truncated.data <- raw.data[first:last]
html.data <- data.frame(readHTMLTable(truncated.data))
colnames(html.data) <- c("No. of Employees", "RSE", "Mean Hourly Wage", "Mean Annual Wage", "Wage RSE")
html.data
## No. of Employees RSE Mean Hourly Wage Mean Annual Wage Wage RSE
## 1 27,920 3.4 % $57.49 $119,570 1.4 %
#Below is an attempt to automate the above process to work with a list of URLs.
oes_scrape <- function(URLs){
  big.data <- data.frame()
  for (url in URLs){
    raw.data <- readLines(url)
    # Keep only the first wage-summary table on each page
    first <- which(grepl("<table border=\"1\"", raw.data))[1]
    last <- which(grepl("</table>", raw.data))[1]
    truncated.data <- raw.data[first:last]
    html.data <- data.frame(readHTMLTable(truncated.data))
    colnames(html.data) <- c("No.Employees", "RSE", "Mean.Hourly.Wage", "Mean.Annual.Wage", "Wage.RSE")
    big.data <- rbind(big.data, html.data)
  }
  return(big.data)
}
salary.frame <- oes_scrape(urls)
## Warning in grepl("<table border=\"1\"", raw.data): input string 4255 is
## invalid in this locale
## Warning in grepl("<table border=\"1\"", raw.data): input string 4260 is
## invalid in this locale
## Warning in grepl("<table border=\"1\"", raw.data): input string 4265 is
## invalid in this locale
## Warning in grepl("<table border=\"1\"", raw.data): input string 10720 is
## invalid in this locale
## Warning in grepl("<table border=\"1\"", raw.data): input string 10725 is
## invalid in this locale
## Warning in grepl("</table>", raw.data): input string 4255 is invalid in
## this locale
## Warning in grepl("</table>", raw.data): input string 4260 is invalid in
## this locale
## Warning in grepl("</table>", raw.data): input string 4265 is invalid in
## this locale
## Warning in grepl("</table>", raw.data): input string 10720 is invalid in
## this locale
## Warning in grepl("</table>", raw.data): input string 10725 is invalid in
## this locale
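The locale warnings above come from grepping the raw page text. As a possible refinement, the same tables could be pulled by parsing the HTML with rvest (loaded further below) instead; a minimal sketch, assuming the wage summary is the first <table> on each OES page (oes_scrape_rvest is just an illustrative name):
oes_scrape_rvest <- function(URLs){
  big.data <- data.frame()
  for (url in URLs){
    # Parse the page and take its first table instead of grepping raw lines
    page <- rvest::read_html(url)
    tbl <- rvest::html_table(rvest::html_nodes(page, "table")[[1]], fill = TRUE)
    big.data <- rbind(big.data, tbl)
  }
  return(big.data)
}
Note that the column names would then come from the page's own header row rather than the hand-set names used above.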
SOC <- numbers.list
salary.frame <- cbind(SOC, salary.frame)
# Strip the leading "$", remove comma separators, then convert to numeric
tmp <- substr(salary.frame$Mean.Annual.Wage, 2, 15)
tmp <- gsub(x = tmp, pattern = ",", replacement = "")
salary.frame$Mean.Annual.Wage <- as.numeric(tmp)
#write.csv(salary.frame, file="salary.csv")
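The same dollar-string cleanup could be wrapped in a small helper for reuse on other wage columns; a minimal sketch (dollars_to_numeric is just an illustrative name):
# Strip a leading "$" and any comma separators, then coerce to numeric
dollars_to_numeric <- function(x) as.numeric(gsub("[$,]", "", as.character(x)))
# e.g. dollars_to_numeric("$119,570") returns 119570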
suppressWarnings(library(rvest, quietly =TRUE))
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
suppressWarnings(library(dplyr, quietly =TRUE))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
suppressWarnings(library(stringr, quietly =TRUE))
suppressWarnings(library(tidyr, quietly =TRUE))
suppressWarnings(library(dplyr, quietly =TRUE))
suppressWarnings(library(ggplot2, quietly =TRUE))
suppressWarnings(library(curl, quietly =TRUE))
############################################
# Re-run the line below to refresh the local copy of the page
curl_download("https://www.bls.gov/emp/tables/occupational-projections-and-characteristics.htm", "outlook.html")
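# A possible refinement (sketch): guard the download so it only runs when the
# cached copy is missing; assumes the same "outlook.html" file name as above.
if (!file.exists("outlook.html")) {
  curl_download("https://www.bls.gov/emp/tables/occupational-projections-and-characteristics.htm", "outlook.html")
}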
BLS_EP_URL <- read_html("outlook.html")
OccProj <- html_nodes(BLS_EP_URL, "table")
head(OccProj)
## {xml_nodeset (2)}
## [1] <table id="main-content-table"><tr>\n<td id="secondary-nav-td">\r\n\ ...
## [2] <table class="regular" cellspacing="0" cellpadding="0" xborder="1">\ ...
OccProj <- BLS_EP_URL %>%
html_nodes("table") %>%
.[2] %>%
html_table(fill = TRUE)
OccProj[[1]] <- OccProj[[1]][-1,]
colnames(OccProj[[1]]) <- c("Title", "SOC", "OccupationType", "2016Employment", "2026Employment", "2016EmplChange2016-26", "2026EmplChange2016-26", "2016Self-Empl_Prcnt", "2016-26_AvgAnnual_OccOpenings", "2017MedianAnnualWage", "TypicalEntryLvlEduc", "PreEmplExperience", "PostEmplTraining")
OccProjTbl <- dplyr::tbl_df(OccProj[[1]])
outlook.frame <- subset(OccProjTbl, (str_detect(OccProjTbl$SOC, '15-')))
outlook.frame$SOC <- substr(outlook.frame$SOC, 4, 7)
Series15 <- dplyr::filter(OccProjTbl, grepl('15-', SOC)) %>%
filter(grepl('Line item', OccupationType))
Series15$SOC <- substr(Series15$SOC, 4, 7)
characteristics.graphic2 <- ggplot(Series15, aes(x = TypicalEntryLvlEduc, y = frequency(SOC), fill = TypicalEntryLvlEduc)) +
  guides(fill = FALSE, color = FALSE) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Typical education needed for entry for 15-0000 Computer Occupations", x = "Education", y = "Frequency")
characteristics.graphic <- ggplot(Series15, aes(x = PreEmplExperience, y = frequency(SOC), fill = PreEmplExperience)) +
  guides(fill = FALSE, color = FALSE) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Work experience in a related occupation for 15-0000 Computer Occupations", x = "Experience", y = "Frequency")
outlook.frame
## # A tibble: 27 x 13
## Title SOC OccupationType `2016Employment` `2026Employment`
## <chr> <chr> <chr> <chr> <chr>
## 1 Comp… 0000 Summary 4,419.0 5,026.5
## 2 Comp… 1100 Summary 4,238.4 4,795.5
## 3 Comp… 1111 Line item 27.9 33.2
## 4 Comp… 1120 Summary 700.5 783.4
## 5 Comp… 1121 Line item 600.5 654.9
## 6 Info… 1122 Line item 100.0 128.5
## 7 Soft… 1130 Summary 1,714.0 2,019.6
## 8 Comp… 1131 Line item 294.9 273.6
## 9 Soft… 1132 Line item 831.3 1,086.6
## 10 Soft… 1133 Line item 425.0 472.1
## # ... with 17 more rows, and 8 more variables:
## # `2016EmplChange2016-26` <chr>, `2026EmplChange2016-26` <chr>,
## # `2016Self-Empl_Prcnt` <chr>, `2016-26_AvgAnnual_OccOpenings` <chr>,
## # `2017MedianAnnualWage` <chr>, TypicalEntryLvlEduc <chr>,
## # PreEmplExperience <chr>, PostEmplTraining <chr>
This merges the salary and outlook data and converts the two columns we're interested in to numeric.
salary.vs.outlook <- na.omit(merge(salary.frame, outlook.frame, by = 'SOC', all = TRUE))
dollars <- as.numeric(as.character(salary.vs.outlook$Mean.Annual.Wage))
jobs <- as.numeric(as.character(salary.vs.outlook$`2016-26_AvgAnnual_OccOpenings`))
title <- as.character(salary.vs.outlook$Title)
# Build all columns from the merged frame so their lengths match
df <- data.frame(title, SOC = salary.vs.outlook$SOC, dollars, jobs)
Here I build a linear model of projected average annual job openings as a function of mean annual wage.
linear.model = lm(jobs~dollars)
plot(dollars, jobs)
abline(linear.model, col="blue", lwd=2)
Here I plot and test residuals.
residual <- residuals(linear.model)
residual <- as.data.frame(residual)
ggplot(residual, aes(residual)) + geom_histogram(fill = 'red')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Here we see, at best, a weak negative linear relationship between projected average annual job openings and mean annual wage for a given occupation; the slope is not statistically significant (p = 0.15, R-squared of about 0.18).
summary(linear.model)
##
## Call:
## lm(formula = jobs ~ dollars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.155 -7.092 -5.994 7.365 27.058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.0988356 21.2629070 2.262 0.0449 *
## dollars -0.0003630 0.0002357 -1.540 0.1518
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.79 on 11 degrees of freedom
## Multiple R-squared: 0.1774, Adjusted R-squared: 0.1026
## F-statistic: 2.372 on 1 and 11 DF, p-value: 0.1518
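The key quantities can also be pulled out of the fitted model rather than read off the printed summary; a minimal sketch:
model.summary <- summary(linear.model)
model.summary$r.squared                    # multiple R-squared, about 0.18
coef(model.summary)["dollars", "Pr(>|t|)"] # p-value for the slope, about 0.15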
The Q-Q plot below checks whether the residuals are approximately normally distributed.
qqnorm(resid(linear.model))
qqline(resid(linear.model))
I learned about this test (the Shapiro-Wilk test for normality) from a classmate, with some follow-up research online. Given the p-value below, we cannot reject the null hypothesis that the residuals come from a normally distributed population.
shapiro.test(linear.model$residuals)
##
## Shapiro-Wilk normality test
##
## data: linear.model$residuals
## W = 0.89888, p-value = 0.1291
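The same conclusion can be drawn programmatically from the test object; a minimal sketch using a conventional 0.05 cutoff:
shapiro.p <- shapiro.test(linear.model$residuals)$p.value
shapiro.p > 0.05  # TRUE here (p is about 0.13), so we fail to reject normality of the residuals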