Employment-Based Immigration and Citizenship Data
Description from https://www.uscis.gov/tools/reports-studies/immigration-forms-data is as follows:
“This section contains reports on the number of petitions and applications for temporary or lawful permanent resident status based on needed job skills.
Transparency for U.S. Workers
H-1B Datasets: These datasets provide information about the hiring practices of employers who petition for foreign national workers.”
The CSV file contains several tables stacked one after another, so it cannot be read directly into a single data frame; each table has to be extracted separately.
library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(utils)
# Location of the H-1B trend-table CSV export.
filename <- "C:/Users/vikas/cuny/data607/projects/Project_2/H1B_Data/h-1b-2007-2017-trend-tables.csv"
# Read the whole file as raw text lines. The connection is deliberately left
# open here: the next section closes it before re-reading the file.
con <- file(filename, "r")
whole <- readLines(con)
# Split line 7 on commas and keep fields 2 through 12 as the integer years.
lines <- str_split(whole[7], ",")[[1]]
years <- lines[2:12] %>% as.integer()
years
## [1] 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
# Column labels used for the yearly counts: "Y2007" through "Y2017".
yearsStr <- paste0("Y", 2007:2017)
This dataset contains a number of tables pertaining to employment-related immigration, such as beneficiaries by Country, Age, Occupation and Industry. Here we restrict our analysis to two of them: beneficiaries by Compensation and beneficiaries by Education level.
Compensation dataframe:
# Build the compensation table: one row per salary range, one column per year.
# Release the connection left open by the previous read, then reopen the file
# so that read.csv() starts counting skipped lines from the top again.
# The new connection is left open; the following section closes it.
close(con)
con <- file(filename, "r")
# Skip the 120 lines that precede the table and read its 9 rows. There is no
# header row, so the columns arrive with the default names V1..V13.
compensation <- read.csv(con, quote = "\"", sep = ",", nrows = 9, skip = 120,
                         header = FALSE, stringsAsFactors = FALSE)
# Discard the last column which contains blanks.
compensation <- compensation %>% select(-V13)
# Set the column names corresponding to each year
compensation <- setNames(compensation, c("SalaryRange", yearsStr))
# Discard the commas and convert to numeric. All year columns are handled in
# a single pass rather than with one copy-pasted statement per column.
compensation[yearsStr] <- lapply(
  compensation[yearsStr],
  function(counts) as.numeric(gsub(",", "", counts))
)
compensation
## SalaryRange Y2007 Y2008 Y2009 Y2010 Y2011 Y2012 Y2013
## 1 Below 25,000 5704 5061 7066 5628 6236 5999 4376
## 2 25,000 to 49,999 75047 59642 51630 41772 36361 34103 26813
## 3 50,000 to 74,999 135727 128802 102781 105306 111649 140780 128858
## 4 75,000 to 99,999 60765 55384 50044 55298 65225 71703 74269
## 5 100,000 to 124,999 23511 22620 20477 24341 29118 33584 38974
## 6 125,000 to 149,999 6613 6635 6507 7501 9608 11577 14770
## 7 150,000 to 174,999 3321 3249 3275 3437 4160 4409 5153
## 8 175,000 to 199,999 1324 1450 1560 1895 2222 2226 2330
## 9 Over 200,000 2609 2632 2786 3094 3833 3861 4147
## Y2014 Y2015 Y2016 Y2017
## 1 4022 3870 4320 6983
## 2 23797 20524 17855 12321
## 3 132372 142865 142847 105827
## 4 87998 105383 117529 99326
## 5 45443 55288 64982 59988
## 6 18412 23911 30079 29416
## 7 6402 8437 11606 11962
## 8 2704 3205 3951 4339
## 9 4821 5369 6180 5945
Education dataframe:
# Build the education table: one row per education level, one column per year.
# Release the connection left open by the previous read, then reopen the file
# so that read.csv() starts counting skipped lines from the top again.
# The new connection is left open when this section finishes.
close(con)
con <- file(filename, "r")
# Skip the 137 lines that precede the table and read its 10 rows. There is no
# header row, so the columns arrive with the default names V1..V13.
education <- read.csv(con, quote = "\"", sep = ",", nrows = 10, skip = 137,
                      header = FALSE, stringsAsFactors = FALSE)
# Discard the last column which contains totals over all years
education <- education %>% select(-V13)
# Set the column names corresponding to each year
education <- setNames(education, c("EducationLevel", yearsStr))
# Discard the commas and convert to numeric. All year columns are handled in
# a single pass rather than with one copy-pasted statement per column.
education[yearsStr] <- lapply(
  education[yearsStr],
  function(counts) as.numeric(gsub(",", "", counts))
)
education
## EducationLevel Y2007 Y2008 Y2009 Y2010
## 1 No Diploma 168 184 234 235
## 2 High School Graduate 771 502 469 433
## 3 Some College Credit (Less than 1 year) 148 106 97 74
## 4 One or More Years of College (No Degree) 878 594 606 606
## 5 Associates Degree 726 574 652 532
## 6 Bachelor's Degree 143937 122941 104511 109478
## 7 Master's Degree 121987 116561 97703 96163
## 8 Professional Degree 14677 13353 13225 13387
## 9 Doctorate Degree 31212 30576 28538 27290
## 10 Other (*) 117 84 91 74
## Y2011 Y2012 Y2013 Y2014 Y2015 Y2016 Y2017
## 1 487 164 101 83 69 50 37
## 2 724 377 301 261 188 145 97
## 3 87 70 46 54 33 28 16
## 4 549 597 402 385 285 252 173
## 5 404 471 376 273 269 225 171
## 6 112334 146174 136453 146368 170865 180077 139055
## 7 113284 122325 125052 141470 159828 180961 165830
## 8 13279 12625 12206 12001 11812 11880 9863
## 9 27130 25188 24671 24995 25188 25602 20589
## 10 134 251 82 81 315 129 276
We can plot the annual numbers of beneficiaries with compensation in excess of $100,000:
# Keep rows 5 through 9 of the compensation table (the salary ranges from
# "100,000 to 124,999" up to "Over 200,000"), with all columns retained.
s100 <- compensation[seq(5, 9), , drop = FALSE]
s100
## SalaryRange Y2007 Y2008 Y2009 Y2010 Y2011 Y2012 Y2013 Y2014 Y2015
## 5 100,000 to 124,999 23511 22620 20477 24341 29118 33584 38974 45443 55288
## 6 125,000 to 149,999 6613 6635 6507 7501 9608 11577 14770 18412 23911
## 7 150,000 to 174,999 3321 3249 3275 3437 4160 4409 5153 6402 8437
## 8 175,000 to 199,999 1324 1450 1560 1895 2222 2226 2330 2704 3205
## 9 Over 200,000 2609 2632 2786 3094 3833 3861 4147 4821 5369
## Y2016 Y2017
## 5 64982 59988
## 6 30079 29416
## 7 11606 11962
## 8 3951 4339
## 9 6180 5945
# Drop the leading label column so that only the yearly counts remain.
s100 <- s100 %>% select(-1)
# Sum each year column over the remaining rows; the result is a one-column
# data frame with one row per year.
s100cols <- data.frame(c(colSums(s100)))
Plot the annual trends in compensation above $100,000.
# Pair each year label with its yearly total so that both aesthetics refer to
# ordinary columns. Mapping y to the s100cols data frame itself is not a valid
# aesthetic: aesthetics must be vectors or column references.
# The totals are taken by position so the column name does not matter.
s100plot <- data.frame(Year = yearsStr, Beneficiaries = s100cols[[1]],
                       stringsAsFactors = FALSE)
p <- ggplot(s100plot)
p + geom_point(aes(x = Year, y = Beneficiaries)) +
  labs(x = "Year", y = "Salaries above $100,000")