by Alexa Chenyang Li
H-1B is an employment-based, non-immigrant visa category for temporary foreign workers in the United States. For a foreign national to apply for H1-B visa, an US employer must offer a job and petition for H-1B visa with the US immigration department. This is the most common visa status applied for and held by international students once they complete college/ higher education (Masters, PhD) and work in a full-time position.
As an international and a STEM-major student who wants to secure a job in the U.S., asking sponsorship for H-1b is an indispensable process. Therefore, it is important for international students to know what is going on about H-1b, especially when we are confronting a flunctant H-1b petition situation today.
As a business analytics major student, whose future job tile is usually with the word “analyst”, learning the changes for analysts’ H-1b petition will be very useful for our job hunting. In this project, I will analyze which worksite have the most H-1b petition and the certified H-1b petition, Which employers send most number of H-1B visa applications, what kind of analyst will get the most H-1b petition and the certified H-1b petition, whether the H-1b petition or certified H-1b petition has certain linear relationship with the prevailing wage, what is the proportion of full-time job and part-time job in general H-1b and certified H-1b petition and the median for each variable.
getwd()
setwd("C:/Users/Alexa Li/Desktop/R_Class")
h1b_raw<-read.csv("C:/Users/Alexa Li/Desktop/R_Class/h1b_kaggle.csv")
Packages Required
library(dplyr)
library(stringr)
library(tidyverse)
library(doBy)
library(plyr)
library(psych)
What kind of analyst will get the highest amount of H-1b petition?
View(h1b_raw)
str(h1b_raw)
complete.cases(h1b_raw)
h1b_raw[complete.cases(h1b_raw), ]
h1b_raw[!h1b_raw$C == "N/A", ]->h1b
str(h1b)
h1b <- na.omit(h1b)
h1b <- h1b[-c(19797, 74904, 185702, 187388, 188030, 277184, 393684, 396132, 438355,
506707, 721439, 874759, 2051359),]
h1b%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(str_detect(JOB_TITLE, "ANALYST"))->analyst
analyst
orderBy(~JOB_TITLE, analyst)->analyst
ddply(analyst, .(analyst$JOB_TITLE), nrow)->counts_title_freq
names(counts_title_freq) <- c("JOB_TITLE","Freq")
counts_title_freq[order(counts_title_freq$Freq, decreasing = TRUE), ]->counts_sort_title
What kind of analyst will get the highest amount of certified H-1b petition?
analyst%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(CASE_STATUS == "CERTIFIED")->certified
certified<-orderBy(~JOB_TITLE, certified)
ddply(certified, .(certified$JOB_TITLE), nrow)->counts_certified_analyst
names(counts_certified_analyst) <- c("JOB_TITLE","Freq")
counts_certified_analyst[order(counts_certified_analyst$Freq, decreasing = TRUE), ]->counts_sort_analyst
table(certified$JOB_TITLE) %>%
as.data.frame() %>%
mutate(,portion = Freq/nrow(analyst)) -> portion
portion_sort <- portion[-grep(pattern = "0", x = portion$Freq),]
Which worksite has the highest amount of H-1b petition?
ddply(h1b, .(WORKSITE), nrow)->counts_worksite
names(counts_worksite) <- c("WORKSITE","Freq")
counts_worksite[order(counts_worksite$Freq, decreasing = TRUE), ]->counts_sort_worksite
head(counts_sort_worksite, n=30)->worksite_thir
ggplot(data=worksite_thir, aes(WORKSITE,Freq,color=Freq))+geom_point()+theme(axis.text.x= element_text(angle = 45, vjust = 1, hjust = 1))
Which employers send the highest number of H-1B visa applications?
h1b
ddply(h1b, .(h1b$EMPLOYER_NAME), nrow)->counts_employer
names(counts_employer) <- c("EMPLOYER_NAME","Freq")
View(counts_employer)
counts_employer[order(counts_employer$Freq, decreasing = TRUE), ]->counts_sort_employer
The proportion of full-time job and part-time job in general H-1b petition
#Full Time Position#
sum(str_count(h1b$FULL_TIME_POSITION, "Y"))/nrow(h1b)
#Part Time Position#
sum(str_count(h1b$FULL_TIME_POSITION, "N"))/nrow(h1b)
The proportion of full-time job and part-time job in certified H-1b petition
#Full Time Position#
sum(str_count(certified$FULL_TIME_POSITION, "Y"))/nrow(h1b)
#Part Time Position#
sum(str_count(certified$FULL_TIME_POSITION, "N"))/nrow(h1b)
2016 Petition
h1b
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(YEAR == "2016")->sixteen
sixteen%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(CASE_STATUS == "CERTIFIED")->sixteen_certified
#Certified rate in 2016#
nrow(sixteen_certified)/nrow(sixteen)
#Summary and descriptive statistics for prevailing wage in 2016#
#General Petition#
summary(sixteen$PREVAILING_WAGE)
describe(sixteen$PREVAILING_WAGE)
#Certified Petition#
summary(sixteen_certified$PREVAILING_WAGE)
describe(sixteen_certified$PREVAILING_WAGE)
2015 Petition
h1b%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(YEAR == "2015")->fifteen
fifteen%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(CASE_STATUS == "CERTIFIED")->fifteen_certified
#Certified rate in 2015#
nrow(fifteen_certified)/nrow(fifteen)
#Summary and descriptive statistics for prevailing wage in 2015#
#General Petition#
summary(fifteen$PREVAILING_WAGE)
describe(fifteen$PREVAILING_WAGE)
#Certified Petition#
summary(fifteen_certified$PREVAILING_WAGE)
describe(fifteen_certified$PREVAILING_WAGE)
2014 Petition
h1b%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(YEAR == "2014")->fourteen
fourteen%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(CASE_STATUS == "CERTIFIED")->fourteen_certified
#Certified rate in 2014#
nrow(fourteen_certified)/nrow(fourteen)
#Summary and descriptive statistics for prevailing wage in 2014#
#General Petition#
summary(fourteen$PREVAILING_WAGE)
describe(fourteen$PREVAILING_WAGE)
#Certified Petition#
summary(fourteen_certified$PREVAILING_WAGE)
describe(fourteen_certified$PREVAILING_WAGE)
2013 Petition
h1b%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(YEAR == "2013")->thirteen
thirteen%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(CASE_STATUS == "CERTIFIED")->thirteen_certified
#Certified rate in 2013#
nrow(thirteen_certified)/nrow(thirteen)
#Summary and descriptive statistics for prevailing wage in 2013#
#General#
summary(thirteen$PREVAILING_WAGE)
describe(thirteen$PREVAILING_WAGE)
#Certified
summary(thirteen_certified$PREVAILING_WAGE)
describe(thirteen_certified$PREVAILING_WAGE)
2012 Petition
h1b%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(YEAR == "2012")->twelve
twelve%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(CASE_STATUS == "CERTIFIED")->twelve_certified
#Certified rate in 2012#
nrow(twelve_certified)/nrow(twelve)
#Summary and descriptive statistics for prevailing wage in 2012#
#General#
summary(twelve$PREVAILING_WAGE)
describe(twelve$PREVAILING_WAGE)
#Certified
summary(twelve_certified$PREVAILING_WAGE)
describe(twelve_certified$PREVAILING_WAGE)
2011 Petition
h1b%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(YEAR == "2011")->eleven
eleven%>%
select(X,CASE_STATUS, EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,lon,lat)%>%
filter(CASE_STATUS == "CERTIFIED")->eleven_certified
#Certified rate 2011
nrow(eleven_certified)/nrow(eleven)
#Summary and descriptive statistics for prevailing wage in 2011#
#General#
summary(eleven$PREVAILING_WAGE)
describe(eleven$PREVAILING_WAGE)
#Certified
summary(eleven_certified$PREVAILING_WAGE)
describe(eleven_certified$PREVAILING_WAGE)
[1] "X": int Number of each case
[2] "CASE_STATUS": Factor Status associated with the last significant event or decision. Valid values include "Certified," "Certified-Withdrawn," Denied," and "Withdrawn"
[3] "EMPLOYER_NAME": Factor Name of employer submitting labor condition application
[4] "SOC_NAME": Factor Occupational name associated with the SOC_CODE. SOC_CODE is the occupational code associated with the job being requested for temporary labor condition, as classified by the Standard Occupational Classification (SOC) System
[5] "JOB_TITLE": Factor Title of the job
[6] "FULL_TIME_POSITION": Factor Y = Full Time Position; N = Part Time Position
[7] "PREVAILING_WAGE": num Prevailing Wage for the job being requested for temporary labor condition. The wage is listed at annual scale in USD. The prevailing wage for a job position is defined as the average wage paid to similarly employed workers in the requested occupation in the area of intended employment. The prevailing wage is based on the employer's minimum requirements for the position
[8] "YEAR": int Year in which the H-1B visa petition was filed
[9] "WORKSITE": Factor City and State information of the foreign worker's intended area of employment
[10] "lon": num Longitude of the worksite
[11] "lat": num Latitude of the worksite
TBC