library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(readxl)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
Import the data
sd <- read_excel("~/Downloads/sd.xlsx", col_types = c("text",
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
"text", "numeric", "text"))
## Warning in strptime(x, format, tz = tz): unknown timezone 'default/America/
## Los_Angeles'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Coercing text to numeric in W146963 / R146963C23: '45870'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Coercing text to numeric in W164631 / R164631C23: '76700'
Basic Examination
glimpse(sd)
## Observations: 167,278
## Variables: 24
## $ CASE_STATUS <chr> "denied", "denied", "denied", "...
## $ CASE_RECEIVED_DATE <chr> "3/14/2014", "3/19/2015", "9/13...
## $ DECISION_DATE <chr> "3/21/2014", "3/19/2015", "9/23...
## $ EMPLOYER_NAME <chr> "ADVANCED TECHNOLOGY GROUP USA,...
## $ PREVAILING_WAGE_SUBMITTED <chr> "6217100", "5067600", "4947000"...
## $ PREVAILING_WAGE_SUBMITTED_UNIT <chr> "year", "year", "year", "month"...
## $ PAID_WAGE_SUBMITTED <chr> "62171", "91440", "49470", "438...
## $ PAID_WAGE_SUBMITTED_UNIT <chr> "year", "year", "year", "year",...
## $ JOB_TITLE <chr> "SOFTWARE ENGINEER", "Assistant...
## $ WORK_CITY <chr> "BLOOMINGTON", "SAN FRANCISCO",...
## $ EDUCATION_LEVEL_REQUIRED <chr> "NA", "Doctorate", "NA", "NA", ...
## $ COLLEGE_MAJOR_REQUIRED <chr> "NA", "marketing", "NA", "NA", ...
## $ EXPERIENCE_REQUIRED_Y_N <chr> "NA", "n", "NA", "NA", "NA", "N...
## $ EXPERIENCE_REQUIRED_NUM_MONTHS <chr> "NA", NA, "NA", "NA", "NA", "NA...
## $ COUNTRY_OF_CITIZENSHIP <chr> "NA", "IRAN", "NA", "NA", "NA",...
## $ PREVAILING_WAGE_SOC_CODE <chr> "15-1132", "25-1011", "25-2052"...
## $ PREVAILING_WAGE_SOC_TITLE <chr> "Software Developers, Applicati...
## $ WORK_STATE <chr> "IL", "CALIFORNIA", "CA", "TX",...
## $ WORK_POSTAL_CODE <chr> "NA", "94132.0", "NA", "NA", "N...
## $ FULL_TIME_POSITION_Y_N <chr> "y", "NA", "y", "y", "y", "y", ...
## $ VISA_CLASS <chr> "H-1B", "greencard", "H-1B", "H...
## $ PREVAILING_WAGE_PER_YEAR <chr> "NA", "NA", "NA", "NA", "NA", "...
## $ PAID_WAGE_PER_YEAR <dbl> 62171, 91440, 49470, 43800, 170...
## $ JOB_TITLE_SUBGROUP <chr> "software engineer", "assistant...
Do some conversions
sd$dec_dt = mdy(sd$DECISION_DATE)
sd$dec_yr = year(sd$dec_dt)
table(sd$dec_yr)
##
## 2011 2012 2013 2014 2015
## 7063 6901 48641 64764 39909
sd$full = (sd$FULL_TIME_POSITION_Y_N)
table(sd$full)
##
## n NA y
## 3591 11093 152594
# Convert NA to Miss
sd$full[is.na(sd$full)] = "Miss"
table(sd$full)
##
## n NA y
## 3591 11093 152594
#Hmm
sum(is.na(sd$full))
## [1] 0
#OK
sd$full[sd$full=="NA"] = "Miss"
table(sd$full)
##
## Miss n y
## 11093 3591 152594
sd$full = factor(sd$full)
sd$jts = factor(sd$JOB_TITLE_SUBGROUP)
table(sd$jts)
##
## assistant professor attorney business analyst
## 18866 1488 27811
## data analyst data scientist management consultant
## 3840 1227 770
## software engineer teacher
## 99364 13912
sd$pw = sd$PAID_WAGE_PER_YEAR
summary(sd$pw)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10500 63000 78600 85533 100006 2500000
table(sd$WORK_STATE)
##
## AK AL ALABAMA
## 60 542 17
## ALASKA AR ARIZONA
## 6 537 206
## ARKANSAS AZ CA
## 21 2364 42677
## CALIFORNIA CO COLORADO
## 4105 1526 88
## CONNECTICUT CT DC
## 82 1941 1325
## DE DELAWARE DISTRICT OF COLUMBIA
## 823 51 45
## FL FLORIDA GA
## 3901 163 5387
## GEORGIA GU GUAM
## 228 61 1
## HAWAII HI IA
## 8 162 954
## ID IDAHO IL
## 180 6 7057
## ILLINOIS IN INDIANA
## 354 1570 47
## IOWA KANSAS KENTUCKY
## 58 51 52
## KS KY LA
## 695 589 824
## LOUISIANA MA MAINE
## 59 6398 11
## MARYLAND MASSACHUSETTS MD
## 153 450 3122
## ME MI MICHIGAN
## 205 3625 219
## MINNESOTA MISSISSIPPI MISSOURI
## 82 13 112
## MN MO MONTANA
## 2034 2661 7
## MP MS MT
## 58 330 56
## NC ND NE
## 3402 133 563
## NEBRASKA NEVADA NEW HAMPSHIRE
## 35 19 16
## NEW JERSEY NEW MEXICO NEW YORK
## 999 38 551
## NH NJ NM
## 411 9199 537
## NORTH CAROLINA NORTH DAKOTA NV
## 176 5 408
## NY OH OHIO
## 10822 3053 139
## OK OKLAHOMA OR
## 532 19 1319
## OREGON PA PENNSYLVANIA
## 216 4411 314
## PR PUERTO RICO PW
## 108 1 1
## RHODE ISLAND RI SC
## 21 507 823
## SD SOUTH CAROLINA SOUTH DAKOTA
## 91 28 14
## TENNESSEE TEXAS TN
## 39 1012 1048
## TX UT UTAH
## 14486 719 38
## VA VERMONT VI
## 5709 41 97
## VIRGIN ISLANDS VIRGINIA VT
## 12 322 90
## WA WASHINGTON WEST VIRGINIA
## 4354 256 10
## WI WISCONSIN WV
## 1357 76 297
## WY WYOMING
## 44 1
# Annoying! We could rename the complete spellings or do something fast and dirty.
sd$WORK_STATE[nchar(sd$WORK_STATE) > 2 ] = "Miss"
table(sd$WORK_STATE)
##
## AK AL AR AZ CA CO CT DC DE FL GA GU
## 60 542 537 2364 42677 1526 1941 1325 823 3901 5387 61
## HI IA ID IL IN KS KY LA MA MD ME MI
## 162 954 180 7057 1570 695 589 824 6398 3122 205 3625
## Miss MN MO MP MS MT NC ND NE NH NJ NM
## 11093 2034 2661 58 330 56 3402 133 563 411 9199 537
## NV NY OH OK OR PA PR PW RI SC SD TN
## 408 10822 3053 532 1319 4411 108 1 507 823 91 1048
## TX UT VA VI VT WA WI WV WY
## 14486 719 5709 97 90 4354 1357 297 44
# Interesting!!
Examine paid wage per year relationship with various categories
summary(sd$jts)
## assistant professor attorney business analyst
## 18866 1488 27811
## data analyst data scientist management consultant
## 3840 1227 770
## software engineer teacher
## 99364 13912
sd %>% ggplot(aes(x=jts,y=pw)) + geom_boxplot() + coord_flip()
# Outlier, what to do?
#sd %>% ggplot(aes(x=jts,y=pw)) + geom_boxplot() + coord_flip() + scale_x_log10()
# Hmm
sd %>% ggplot(aes(x=jts,y=pw)) + geom_boxplot() + scale_y_log10() + coord_flip()