Data607_Project2

Employment-Based Immigration and Citizenship Data

Description from https://www.uscis.gov/tools/reports-studies/immigration-forms-data is as follows:

“This section contains reports on the number of petitions and applications for temporary or lawful permanent resident status based on needed job skills.

Transparency for U.S. Workers

H-1B Datasets: These datasets provide information about the hiring practices of employers who petition for foreign national workers.”

The CSV file contains several separate tables stacked one after another, so it cannot be read directly as a single data frame. Each table has to be extracted by its position in the file.

library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(utils)

# Location of the H-1B trend-tables CSV file.
filename <- "C:/Users/vikas/cuny/data607/projects/Project_2/H1B_Data/h-1b-2007-2017-trend-tables.csv"

# Read the raw text of the whole file. The connection is left open here;
# it is closed just before the file is read again further down.
con <- file(filename, open = "r")
whole <- readLines(con)

# Split line 7 of the file on commas; fields 2 through 12 are parsed as
# the integer years covered by the tables.
lines <- str_split(whole[7], ",")[[1]]
years <- as.integer(lines[seq(2, 12)])
years

##  [1] 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017

# Column labels for the eleven yearly columns: "Y2007" through "Y2017".
yearsStr <- paste0("Y", 2007:2017)

This dataset contains a number of tables pertaining to employment-related immigration, such as beneficiaries by Country, Age, Occupation and Industry. Here we shall restrict our analysis to the data based on Compensation and based on Education levels.

Compensation dataframe:

# Close and reopen the file so that the read below starts from the first line.
close(con)
con <- file(filename, "r")

# Read the compensation table: 9 rows located after the first 120 lines of
# the file. The file has no usable header row at that position, so default
# column names (V1, V2, ...) are assigned.
# The connection is left open here; it is closed before the next table is read.
compensation <- read.csv(con, quote = "\"", sep = ",", nrows = 9, skip = 120,
                         header = FALSE, stringsAsFactors = FALSE)

# Discard the last column which contains blanks.
compensation <- compensation %>% select(-V13)

# Set the column names corresponding to each year
compensation <- setNames(compensation, c("SalaryRange", yearsStr))

# Discard the commas (thousands separators) and convert every year column to
# numeric in a single pass, rather than repeating one statement per year.
compensation[yearsStr] <- lapply(
  compensation[yearsStr],
  function(counts) as.numeric(gsub(",", "", counts))
)

compensation

##          SalaryRange  Y2007  Y2008  Y2009  Y2010  Y2011  Y2012  Y2013
## 1       Below 25,000   5704   5061   7066   5628   6236   5999   4376
## 2   25,000 to 49,999  75047  59642  51630  41772  36361  34103  26813
## 3   50,000 to 74,999 135727 128802 102781 105306 111649 140780 128858
## 4   75,000 to 99,999  60765  55384  50044  55298  65225  71703  74269
## 5 100,000 to 124,999  23511  22620  20477  24341  29118  33584  38974
## 6 125,000 to 149,999   6613   6635   6507   7501   9608  11577  14770
## 7 150,000 to 174,999   3321   3249   3275   3437   4160   4409   5153
## 8 175,000 to 199,999   1324   1450   1560   1895   2222   2226   2330
## 9       Over 200,000   2609   2632   2786   3094   3833   3861   4147
##    Y2014  Y2015  Y2016  Y2017
## 1   4022   3870   4320   6983
## 2  23797  20524  17855  12321
## 3 132372 142865 142847 105827
## 4  87998 105383 117529  99326
## 5  45443  55288  64982  59988
## 6  18412  23911  30079  29416
## 7   6402   8437  11606  11962
## 8   2704   3205   3951   4339
## 9   4821   5369   6180   5945

Education dataframe:

# Close and reopen the file so that the read below starts from the first line.
close(con)
con <- file(filename, "r")

# Read the education table: 10 rows located after the first 137 lines of
# the file. The file has no usable header row at that position, so default
# column names (V1, V2, ...) are assigned.
education <- read.csv(con, quote = "\"", sep = ",", nrows = 10, skip = 137,
                      header = FALSE, stringsAsFactors = FALSE)

# This is the last read from the file, so release the connection instead of
# leaving it open for the rest of the session.
close(con)

# Discard the last column which contains totals over all years
education <- education %>% select(-V13)

# Set the column names corresponding to each year
education <- setNames(education, c("EducationLevel", yearsStr))

# Discard the commas (thousands separators) and convert every year column to
# numeric in a single pass, rather than repeating one statement per year.
education[yearsStr] <- lapply(
  education[yearsStr],
  function(counts) as.numeric(gsub(",", "", counts))
)

education

##                              EducationLevel  Y2007  Y2008  Y2009  Y2010
## 1                                No Diploma    168    184    234    235
## 2                      High School Graduate    771    502    469    433
## 3    Some College Credit (Less than 1 year)    148    106     97     74
## 4  One or More Years of College (No Degree)    878    594    606    606
## 5                         Associates Degree    726    574    652    532
## 6                         Bachelor's Degree 143937 122941 104511 109478
## 7                           Master's Degree 121987 116561  97703  96163
## 8                       Professional Degree  14677  13353  13225  13387
## 9                          Doctorate Degree  31212  30576  28538  27290
## 10                                Other (*)    117     84     91     74
##     Y2011  Y2012  Y2013  Y2014  Y2015  Y2016  Y2017
## 1     487    164    101     83     69     50     37
## 2     724    377    301    261    188    145     97
## 3      87     70     46     54     33     28     16
## 4     549    597    402    385    285    252    173
## 5     404    471    376    273    269    225    171
## 6  112334 146174 136453 146368 170865 180077 139055
## 7  113284 122325 125052 141470 159828 180961 165830
## 8   13279  12625  12206  12001  11812  11880   9863
## 9   27130  25188  24671  24995  25188  25602  20589
## 10    134    251     82     81    315    129    276

We can plot the annual numbers of beneficiaries with compensation in excess of $100,000:

# Keep rows 5 through 9 of the compensation table, i.e. the salary ranges
# starting at 100,000.
s100 <- compensation[seq(5, 9), ]
s100

##          SalaryRange Y2007 Y2008 Y2009 Y2010 Y2011 Y2012 Y2013 Y2014 Y2015
## 5 100,000 to 124,999 23511 22620 20477 24341 29118 33584 38974 45443 55288
## 6 125,000 to 149,999  6613  6635  6507  7501  9608 11577 14770 18412 23911
## 7 150,000 to 174,999  3321  3249  3275  3437  4160  4409  5153  6402  8437
## 8 175,000 to 199,999  1324  1450  1560  1895  2222  2226  2330  2704  3205
## 9       Over 200,000  2609  2632  2786  3094  3833  3861  4147  4821  5369
##   Y2016 Y2017
## 5 64982 59988
## 6 30079 29416
## 7 11606 11962
## 8  3951  4339
## 9  6180  5945

# Discard the first column (the salary-range labels) so that only the
# per-year count columns remain.
s100 <- select(s100, -1)
# Sum each year column across the selected salary ranges. The result is a
# one-column data frame with one row per year column; the row names are the
# year labels and the single column's name is auto-generated by data.frame()
# from the unnamed argument.
s100cols <- data.frame(c(colSums(s100)))

Plot the annual trends in compensation above $100,000.

# Build an explicit plotting frame with one row per year: the year label and
# the summed number of beneficiaries in the salary ranges from 100,000 up.
# Mapping the aesthetics to named columns of the plot data avoids passing the
# whole `s100cols` data frame as the y aesthetic and relying on a global
# vector for x.
stopifnot(length(yearsStr) == nrow(s100cols))
s100plot <- data.frame(
  Year = yearsStr,
  Beneficiaries = s100cols[[1]],
  stringsAsFactors = FALSE
)

p <- ggplot(s100plot, aes(x = Year, y = Beneficiaries))
p + geom_point() + labs(x="Year", y="Salaries above $100,000")

Data607_Project2_part2

Vikas Sinha

October 7, 2017