library(stringr) # to manipulate data
library(lubridate) # to manipulate dates
library(ggplot2)
library(data.table)
# devtools::install_github("hrecht/censusapi") is a helpful package if you want to
# gather different or more current census data
Prior to doing my analysis I scraped H1B data from the web and saved the data to a comma-separated-values file.
Step 1: Cleaning our data using lubridate and stringr. Load the data using fread() (from the data.table package), which is faster than read.csv.
h1b_data <- fread("h1b_data.csv")
head(h1b_data)
## EMPLOYER JOB TITLE BASE SALARY LOCATION
## 1: INFOSYS SOLUTIONS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 2: INFOSMART TECHNOLOGIES INC .NET DEVELOPER 60,000 ATLANTA, GA
## 3: OBJECTNET TECHNOLOGIES INC .NET DEVELOPER 60,000 ATLANTA, GA
## 4: WORK FORCE NETWORKS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 5: 1 WAY SOLUTIONS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 6: VEDICSOFT SOLUTIONS LLC .NET DEVELOPER 61,422 ATLANTA, GA
## SUBMIT DATE START DATE CASE STATUS
## 1: 02/05/2015 07/14/2015 CERTIFIED
## 2: 02/23/2015 02/25/2015 CERTIFIED
## 3: 03/04/2015 09/01/2015 CERTIFIED
## 4: 03/17/2015 09/01/2015 CERTIFIED
## 5: 05/14/2015 10/24/2015 CERTIFIED
## 6: 05/27/2015 06/08/2015 CERTIFIED
Step 2: tidy the column names, i.e. rename them.
colnames(h1b_data) <- tolower(names(h1b_data))
(colnames(h1b_data) <- gsub(" ", "_", names(h1b_data)))
## [1] "employer" "job_title" "base_salary" "location" "submit_date"
## [6] "start_date" "case_status"
Step 3: check out the data we are dealing with using tail()
tail(h1b_data, 8)
## employer job_title
## 1: POWER CONVERSION TECHNOLOGIES INC ELECTRICAL ENGINEER
## 2: XYLEM WATER SOLUTIONS ZELIENOPLE LLC PRODUCT ENGINEER
## 3: GTI SPINDLE TECHNOLOGY INC PRODUCTION ENGINEER
## 4: LINN OPERATING INC GEOLOGIST
## 5: LINN OPERATING INC GEOLOGIST II
## 6: NANO-LUX OPTICAL NETWORK TESTING ENGINEER
## 7: MUFUMBO LABS INC SOFTWARE ENGINEER
## 8: ONR NATIONAL SPEECH PATHOLOGY INC OCCUPATIONAL THERAPIST
## base_salary location submit_date start_date case_status
## 1: 76,000 ZELIENOPLE, PA 04/22/2016 09/08/2016 CERTIFIED
## 2: 85,000 ZELIENOPLE, PA 03/11/2016 08/30/2016 CERTIFIED
## 3: 50,000 ZEBULON, NC 08/23/2016 08/30/2016 CERTIFIED
## 4: 119,000 ZAPATA, TX 06/14/2016 12/01/2016 CERTIFIED
## 5: 119,000 ZAPATA, TX 06/06/2016 12/01/2016 WITHDRAWN
## 6: 53,040 ZEPHYR COVE, NV 03/08/2016 09/03/2016 CERTIFIED
## 7: 72,000 ZEPHYR COVE, NV 03/16/2016 09/15/2016 CERTIFIED
## 8: 80,000 ZEARING, IA 04/24/2016 05/01/2016 CERTIFIED
Step 4: check the class of each column of data
apply(h1b_data, 2, class)
## employer job_title base_salary location submit_date start_date
## "character" "character" "character" "character" "character" "character"
## case_status
## "character"
Step 5: correct the columns that have the wrong class associated with their data change the dates to POSIxct format, start by replacing ‘/’ with ‘-’.
h1b_data$start_date <- gsub("/", "-", h1b_data$start_date)
# now parse data to date format
h1b_data$start_date <- mdy(h1b_data$start_date)
# check the class of h1b_data$start_date to make sure it is correct
class(h1b_data$start_date)
## [1] "Date"
Nice! Now, lets change all the date columns to the correct date format.
head(h1b_data)
## employer job_title base_salary location
## 1: INFOSYS SOLUTIONS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 2: INFOSMART TECHNOLOGIES INC .NET DEVELOPER 60,000 ATLANTA, GA
## 3: OBJECTNET TECHNOLOGIES INC .NET DEVELOPER 60,000 ATLANTA, GA
## 4: WORK FORCE NETWORKS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 5: 1 WAY SOLUTIONS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 6: VEDICSOFT SOLUTIONS LLC .NET DEVELOPER 61,422 ATLANTA, GA
## submit_date start_date case_status
## 1: 02/05/2015 2015-07-14 CERTIFIED
## 2: 02/23/2015 2015-02-25 CERTIFIED
## 3: 03/04/2015 2015-09-01 CERTIFIED
## 4: 03/17/2015 2015-09-01 CERTIFIED
## 5: 05/14/2015 2015-10-24 CERTIFIED
## 6: 05/27/2015 2015-06-08 CERTIFIED
h1b_data$submit_date <- gsub("/", "-", h1b_data$submit_date) %>%
mdy()
class(h1b_data$submit_date)
## [1] "Date"
Step 6: add a month column and a year column.
h1b_data$start_month <- lubridate::month(h1b_data$start_date, label = TRUE)
h1b_data$start_year <- lubridate::year(h1b_data$start_date)
# submit_month columns
h1b_data$submit_month <- lubridate::month(h1b_data$submit_date, label = TRUE)
h1b_data$submit_year <- year(h1b_data$submit_date)
head(h1b_data)
## employer job_title base_salary location
## 1: INFOSYS SOLUTIONS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 2: INFOSMART TECHNOLOGIES INC .NET DEVELOPER 60,000 ATLANTA, GA
## 3: OBJECTNET TECHNOLOGIES INC .NET DEVELOPER 60,000 ATLANTA, GA
## 4: WORK FORCE NETWORKS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 5: 1 WAY SOLUTIONS INC .NET DEVELOPER 60,000 ATLANTA, GA
## 6: VEDICSOFT SOLUTIONS LLC .NET DEVELOPER 61,422 ATLANTA, GA
## submit_date start_date case_status start_month start_year submit_month
## 1: 2015-02-05 2015-07-14 CERTIFIED Jul 2015 Feb
## 2: 2015-02-23 2015-02-25 CERTIFIED Feb 2015 Feb
## 3: 2015-03-04 2015-09-01 CERTIFIED Sep 2015 Mar
## 4: 2015-03-17 2015-09-01 CERTIFIED Sep 2015 Mar
## 5: 2015-05-14 2015-10-24 CERTIFIED Oct 2015 May
## 6: 2015-05-27 2015-06-08 CERTIFIED Jun 2015 May
## submit_year
## 1: 2015
## 2: 2015
## 3: 2015
## 4: 2015
## 5: 2015
## 6: 2015
Step 7: change the base_salary class to numeric. First remove the comma.
h1b_data$base_salary <- gsub(",", "", h1b_data$base_salary) %>%
as.numeric()
class(h1b_data$base_salary)
## [1] "numeric"
We may want to separate our data by state for our analysis Step 8: create a matrix with state and city separated. Use str_split_fixed()
states <- str_split_fixed(h1b_data$location, ", ", 2)
head(states)
## [,1] [,2]
## [1,] "ATLANTA" "GA"
## [2,] "ATLANTA" "GA"
## [3,] "ATLANTA" "GA"
## [4,] "ATLANTA" "GA"
## [5,] "ATLANTA" "GA"
## [6,] "ATLANTA" "GA"
tail(states)
## [,1] [,2]
## [1071394,] "ZEBULON" "NC"
## [1071395,] "ZAPATA" "TX"
## [1071396,] "ZAPATA" "TX"
## [1071397,] "ZEPHYR COVE" "NV"
## [1071398,] "ZEPHYR COVE" "NV"
## [1071399,] "ZEARING" "IA"
Now, add a city and state column to h1b_data.
h1b_data$city <- states[, 1]
h1b_data$state <- states[, 2]
head(h1b_data)
## employer job_title base_salary location
## 1: INFOSYS SOLUTIONS INC .NET DEVELOPER 60000 ATLANTA, GA
## 2: INFOSMART TECHNOLOGIES INC .NET DEVELOPER 60000 ATLANTA, GA
## 3: OBJECTNET TECHNOLOGIES INC .NET DEVELOPER 60000 ATLANTA, GA
## 4: WORK FORCE NETWORKS INC .NET DEVELOPER 60000 ATLANTA, GA
## 5: 1 WAY SOLUTIONS INC .NET DEVELOPER 60000 ATLANTA, GA
## 6: VEDICSOFT SOLUTIONS LLC .NET DEVELOPER 61422 ATLANTA, GA
## submit_date start_date case_status start_month start_year submit_month
## 1: 2015-02-05 2015-07-14 CERTIFIED Jul 2015 Feb
## 2: 2015-02-23 2015-02-25 CERTIFIED Feb 2015 Feb
## 3: 2015-03-04 2015-09-01 CERTIFIED Sep 2015 Mar
## 4: 2015-03-17 2015-09-01 CERTIFIED Sep 2015 Mar
## 5: 2015-05-14 2015-10-24 CERTIFIED Oct 2015 May
## 6: 2015-05-27 2015-06-08 CERTIFIED Jun 2015 May
## submit_year city state
## 1: 2015 ATLANTA GA
## 2: 2015 ATLANTA GA
## 3: 2015 ATLANTA GA
## 4: 2015 ATLANTA GA
## 5: 2015 ATLANTA GA
## 6: 2015 ATLANTA GA
Where you able to find the added city and state columns in the h1b_data data frame? Pretty cool!
state_tally <- table(h1b_data$state)
state_tally <- data.frame(state = names(state_tally), h1b = as.vector(state_tally))
head(state_tally, 10)
## state h1b
## 1 7
## 2 AK 286
## 3 AL 2747
## 4 AR 5450
## 5 AZ 16512
## 6 CA 204006
## 7 CO 11174
## 8 CT 16637
## 9 DC 6951
## 10 DE 6488
Now that we know what our state_tally data frame looks like we we can visualize it using a barplot.
Step 9: make a barchart of the state_tally df.
ggplot(state_tally, aes(x = state, y = h1b)) +
geom_col()
Step 10: compare our data to census data from a different year. R has a built in df called state.x77 which contains census data for every state from the year 1977. Here’s what state.x77 looks like:
head(state.x77)
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## Alabama 3615 3624 2.1 69.05 15.1 41.3 20
## Alaska 365 6315 1.5 69.31 11.3 66.7 152
## Arizona 2212 4530 1.8 70.55 7.8 58.1 15
## Arkansas 2110 3378 1.9 70.66 10.1 39.9 65
## California 21198 5114 1.1 71.71 10.3 62.6 20
## Colorado 2541 4884 0.7 72.06 6.8 63.9 166
## Area
## Alabama 50708
## Alaska 566432
## Arizona 113417
## Arkansas 51945
## California 156361
## Colorado 103766
Step 11: merge the state.x77 data with the state_tally data.
state_data <- data.frame(state = state.abb, state.x77)
state_data <- merge(state_tally, state_data,
by = "state")
head(state_data)
## state h1b Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 1 AK 286 365 6315 1.5 69.31 11.3 66.7 152
## 2 AL 2747 3615 3624 2.1 69.05 15.1 41.3 20
## 3 AR 5450 2110 3378 1.9 70.66 10.1 39.9 65
## 4 AZ 16512 2212 4530 1.8 70.55 7.8 58.1 15
## 5 CA 204006 21198 5114 1.1 71.71 10.3 62.6 20
## 6 CO 11174 2541 4884 0.7 72.06 6.8 63.9 166
## Area
## 1 566432
## 2 50708
## 3 51945
## 4 113417
## 5 156361
## 6 103766
Now that we have a tidy data frame we can find relationships between the variables. Here is an example question, do the number of h1b applications correlate with the total population of a state?
Step 12: find correlation between state population and h1b data using the cor() function.
cor(state_data$Population, state_data$h1b)
## [1] 0.8828295
Becuase the correlation between a states total population and its h1b submissions is close to 1, it is safe to say that there is a strong positive linear association between the two variables. But this should suprise us because one would think the larger the state’s population the more h1b submissions.
Lets do some less obvious investigation.
Step 13: make a scatterplot matrix to find relationships between multiple variables. Use pairs() to make a scatterplot matrix.
pairs(~ h1b + Population + Income + Illiteracy + HS.Grad,
data = state_data,
main = "H1B relationships between Population, Income, Illiteracy rate, and HS grad percentages")
Take a look at the relationship between h1b submissions and illiteracy rate. The data suggests that areas with lower illiteracy rates actually have more h1b applications. Could this mean immigrant workers are actually smart, not drug dealears, criminals, or rapists!? How do you think Mr. Trump would respond to this data?
We are almost done! One more question: are h1b applicants with higher salaries more likely to be approved or denied for a visa?
Step 14: make a visualization (i.e. a graph)
ggplot(data = h1b_data, aes(x = factor(case_status), y = base_salary, fill = as.factor(case_status))) +
geom_boxplot() +
ylim(0, 150000)
That’s interesting! The data suggests that an applicant’s base_salary doesn’t affect their chances of being certified or denied.