This scrapes the data (was previous work from 607) from the Bureau of Labor Statistics.


# Dependencies

suppressWarnings(library(curl, quietly =TRUE))
suppressWarnings(library(stringr, quietly =TRUE))
suppressWarnings(library(XML, quietly =TRUE))

# Downloading Data

#This section downloads a single page. However, it can be modified to work across similar pages. Notice how that would only require changing the code  in the URL. TODO get a list of pertinent OES codes. 

# From: https://www.bls.gov/oes/current/oes_stru.htm#15-0000
##  List of URLS
# IT was fastest to type these by hand
raw.data <- readLines("https://www.bls.gov/oes/current/oes151111.htm")
numbers.list <- c(1133, 1134, 1141, 1142, 1143, 1151, 1152, 1199,  2011, 2021, 2031, 2041, 2090)
urls <- c()
i = 1
for (number in numbers.list){
  url = paste(c("https://www.bls.gov/oes/current/oes15",number,".htm"),collapse = "")
  urls[i] <- url
  i = i + 1
}
urls
##  [1] "https://www.bls.gov/oes/current/oes151133.htm"
##  [2] "https://www.bls.gov/oes/current/oes151134.htm"
##  [3] "https://www.bls.gov/oes/current/oes151141.htm"
##  [4] "https://www.bls.gov/oes/current/oes151142.htm"
##  [5] "https://www.bls.gov/oes/current/oes151143.htm"
##  [6] "https://www.bls.gov/oes/current/oes151151.htm"
##  [7] "https://www.bls.gov/oes/current/oes151152.htm"
##  [8] "https://www.bls.gov/oes/current/oes151199.htm"
##  [9] "https://www.bls.gov/oes/current/oes152011.htm"
## [10] "https://www.bls.gov/oes/current/oes152021.htm"
## [11] "https://www.bls.gov/oes/current/oes152031.htm"
## [12] "https://www.bls.gov/oes/current/oes152041.htm"
## [13] "https://www.bls.gov/oes/current/oes152090.htm"

#Since we're only interested in the first table, we'll cut everything else out.

# Regex for all text between two table
first <- which(grepl("<table border=\"1\"", raw.data))[1]
last <- which(grepl("</table>", raw.data))[1]
truncated.data <- raw.data[first:last]



html.data <- data.frame(readHTMLTable(truncated.data))
colnames(html.data) <- c("No. of Employees", "RSE", "Mean Hourly Wage", "Mean Annual Wage", "Wage RSE")
html.data
##   No. of Employees   RSE Mean Hourly Wage Mean Annual Wage Wage RSE
## 1           27,920 3.4 %           $57.49         $119,570    1.4 %


#Below is an attempt to automate the above process to work with a list of URLs. 

oes_scrape <- function(URLs){
  big.data <- data.frame()
  for (url in URLs){
    raw.data <- readLines(url)
    first <- which(grepl("<table border=\"1\"", raw.data))[1]
    last <- which(grepl("</table>", raw.data))[1]
    truncated.data <- raw.data[first:last]
    html.data <- data.frame(readHTMLTable(truncated.data))
    colnames(html.data) <- c("No.Employees", "RSE", "Mean.Hourly.Wage", "Mean.Annual.Wage", "Wage.RSE")
    big.data <- rbind(big.data, html.data)
  }
  return(big.data)
  
  
}
salary.frame <- oes_scrape(urls)
## Warning in grepl("<table border=\"1\"", raw.data): input string 4255 is
## invalid in this locale
## Warning in grepl("<table border=\"1\"", raw.data): input string 4260 is
## invalid in this locale
## Warning in grepl("<table border=\"1\"", raw.data): input string 4265 is
## invalid in this locale
## Warning in grepl("<table border=\"1\"", raw.data): input string 10720 is
## invalid in this locale
## Warning in grepl("<table border=\"1\"", raw.data): input string 10725 is
## invalid in this locale
## Warning in grepl("</table>", raw.data): input string 4255 is invalid in
## this locale
## Warning in grepl("</table>", raw.data): input string 4260 is invalid in
## this locale
## Warning in grepl("</table>", raw.data): input string 4265 is invalid in
## this locale
## Warning in grepl("</table>", raw.data): input string 10720 is invalid in
## this locale
## Warning in grepl("</table>", raw.data): input string 10725 is invalid in
## this locale
SOC <- numbers.list
salary.frame <- cbind(SOC, salary.frame)

tmp <- substr(salary.frame$Mean.Annual.Wage, 2, 15)
tmp <- gsub(x = tmp, pattern = ",", replacement ="")
salary.frame$Mean.Annual.Wage <- as.numeric(tmp)

#write.csv(salary.frame, file="salary.csv")

suppressWarnings(library(rvest, quietly =TRUE))
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
## 
##     xml
suppressWarnings(library(dplyr, quietly =TRUE))
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
suppressWarnings(library(stringr, quietly =TRUE))
suppressWarnings(library(tidyr, quietly =TRUE))
suppressWarnings(library(dplyr, quietly =TRUE))
suppressWarnings(library(ggplot2, quietly =TRUE))
suppressWarnings(library(curl, quietly =TRUE))

############################################
# Uncomment the below to download again

curl_download("https://www.bls.gov/emp/tables/occupational-projections-and-characteristics.htm", "outlook.html")
BLS_EP_URL <- read_html("outlook.html")


OccProj <- html_nodes(BLS_EP_URL, "table")
head(OccProj)
## {xml_nodeset (2)}
## [1] <table id="main-content-table"><tr>\n<td id="secondary-nav-td">\r\n\ ...
## [2] <table class="regular" cellspacing="0" cellpadding="0" xborder="1">\ ...

OccProj <- BLS_EP_URL %>%
  html_nodes("table") %>%
  .[2] %>%
  html_table(fill = TRUE)


OccProj[[1]] <- OccProj[[1]][-1,]

colnames(OccProj[[1]]) <- c("Title", "SOC", "OccupationType", "2016Employment", "2026Employment", "2016EmplChange2016-26", "2026EmplChange2016-26", "2016Self-Empl_Prcnt", "2016-26_AvgAnnual_OccOpenings", "2017MedianAnnualWage", "TypicalEntryLvlEduc", "PreEmplExperience", "PostEmplTraining")  

OccProjTbl <- dplyr::tbl_df(OccProj[[1]]) 

outlook.frame <- subset(OccProjTbl, (str_detect(OccProjTbl$SOC, '15-')))
outlook.frame$SOC <- substr(outlook.frame$SOC, 4, 7)

Series15 <- dplyr::filter(OccProjTbl, grepl('15-', SOC)) %>%
  filter(grepl('Line item', OccupationType))
Series15$SOC <- substr(Series15$SOC, 4, 7)

characteristics.graphic2 <- ggplot(Series15, aes(x = TypicalEntryLvlEduc, y = frequency(SOC), fill=TypicalEntryLvlEduc)) + 
  guides(fill=FALSE, color=FALSE)+
  geom_bar(stat="identity") +
  scale_fill_brewer(palette="Set1")+
  labs(title = "Typical education needed for entry for 15-000 Computer Occupation", x = "Education", y = "Frequency") 

characteristics.graphic <- ggplot(Series15, aes(x = PreEmplExperience, y = frequency(SOC), fill=PreEmplExperience)) + 
  guides(fill=FALSE, color=FALSE)+
  geom_bar(stat="identity") +
  scale_fill_brewer(palette="Set1")+
  labs(title = "Work experience in a related occupation for 15-000 Computer Occupation", x = "Experience", y = "Frequency") 

outlook.frame
## # A tibble: 27 x 13
##    Title SOC   OccupationType `2016Employment` `2026Employment`
##    <chr> <chr> <chr>          <chr>            <chr>           
##  1 Comp… 0000  Summary        4,419.0          5,026.5         
##  2 Comp… 1100  Summary        4,238.4          4,795.5         
##  3 Comp… 1111  Line item      27.9             33.2            
##  4 Comp… 1120  Summary        700.5            783.4           
##  5 Comp… 1121  Line item      600.5            654.9           
##  6 Info… 1122  Line item      100.0            128.5           
##  7 Soft… 1130  Summary        1,714.0          2,019.6         
##  8 Comp… 1131  Line item      294.9            273.6           
##  9 Soft… 1132  Line item      831.3            1,086.6         
## 10 Soft… 1133  Line item      425.0            472.1           
## # ... with 17 more rows, and 8 more variables:
## #   `2016EmplChange2016-26` <chr>, `2026EmplChange2016-26` <chr>,
## #   `2016Self-Empl_Prcnt` <chr>, `2016-26_AvgAnnual_OccOpenings` <chr>,
## #   `2017MedianAnnualWage` <chr>, TypicalEntryLvlEduc <chr>,
## #   PreEmplExperience <chr>, PostEmplTraining <chr>

This cleans the two columns we’re interested in.

salary.vs.outlook <- na.omit(merge(salary.frame, outlook.frame, by = 'SOC', all = TRUE))
dollars <- as.numeric(as.character(salary.vs.outlook$Mean.Annual.Wage))
jobs <- as.numeric(as.character(salary.vs.outlook$`2016-26_AvgAnnual_OccOpenings`))
title <- as.character(outlook.frame$Title)
df <- data.frame(cbind(title, SOC, dollars, jobs))
## Warning in cbind(title, SOC, dollars, jobs): number of rows of result is
## not a multiple of vector length (arg 2)

Here I built a linear model

linear.model = lm(jobs~dollars)
plot(dollars, jobs) 
abline(linear.model, col="blue", lwd=2)

Here I plot and test residuals.

residual <- residuals(linear.model)
residual <- as.data.frame(residual)
ggplot(residual, aes(residual)) + geom_histogram(fill = 'red')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Here we see a weak linear relationship between projected job growth and average salary for a given occupation.

summary(linear.model)
## 
## Call:
## lm(formula = jobs ~ dollars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.155  -7.092  -5.994   7.365  27.058 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 48.0988356 21.2629070   2.262   0.0449 *
## dollars     -0.0003630  0.0002357  -1.540   0.1518  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.79 on 11 degrees of freedom
## Multiple R-squared:  0.1774, Adjusted R-squared:  0.1026 
## F-statistic: 2.372 on 1 and 11 DF,  p-value: 0.1518

This visually confirms the weak linear relationship.

qqnorm(resid(linear.model))
qqline(resid(linear.model))

I learned about this test from my classmate, with some follow up research online. Furthermore, we cannot reject the null hypothesis that the sample comes from a normally distributed population.

shapiro.test(linear.model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  linear.model$residuals
## W = 0.89888, p-value = 0.1291