library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(readxl)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

Import the data

sd <- read_excel("~/Downloads/sd.xlsx", col_types = c("text", 
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
"text", "text", "text", "text", "text", 
"text", "numeric", "text"))
## Warning in strptime(x, format, tz = tz): unknown timezone 'default/America/
## Los_Angeles'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Coercing text to numeric in W146963 / R146963C23: '45870'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Coercing text to numeric in W164631 / R164631C23: '76700'

Basic Examination

glimpse(sd)
## Observations: 167,278
## Variables: 24
## $ CASE_STATUS                    <chr> "denied", "denied", "denied", "...
## $ CASE_RECEIVED_DATE             <chr> "3/14/2014", "3/19/2015", "9/13...
## $ DECISION_DATE                  <chr> "3/21/2014", "3/19/2015", "9/23...
## $ EMPLOYER_NAME                  <chr> "ADVANCED TECHNOLOGY GROUP USA,...
## $ PREVAILING_WAGE_SUBMITTED      <chr> "6217100", "5067600", "4947000"...
## $ PREVAILING_WAGE_SUBMITTED_UNIT <chr> "year", "year", "year", "month"...
## $ PAID_WAGE_SUBMITTED            <chr> "62171", "91440", "49470", "438...
## $ PAID_WAGE_SUBMITTED_UNIT       <chr> "year", "year", "year", "year",...
## $ JOB_TITLE                      <chr> "SOFTWARE ENGINEER", "Assistant...
## $ WORK_CITY                      <chr> "BLOOMINGTON", "SAN FRANCISCO",...
## $ EDUCATION_LEVEL_REQUIRED       <chr> "NA", "Doctorate", "NA", "NA", ...
## $ COLLEGE_MAJOR_REQUIRED         <chr> "NA", "marketing", "NA", "NA", ...
## $ EXPERIENCE_REQUIRED_Y_N        <chr> "NA", "n", "NA", "NA", "NA", "N...
## $ EXPERIENCE_REQUIRED_NUM_MONTHS <chr> "NA", NA, "NA", "NA", "NA", "NA...
## $ COUNTRY_OF_CITIZENSHIP         <chr> "NA", "IRAN", "NA", "NA", "NA",...
## $ PREVAILING_WAGE_SOC_CODE       <chr> "15-1132", "25-1011", "25-2052"...
## $ PREVAILING_WAGE_SOC_TITLE      <chr> "Software Developers, Applicati...
## $ WORK_STATE                     <chr> "IL", "CALIFORNIA", "CA", "TX",...
## $ WORK_POSTAL_CODE               <chr> "NA", "94132.0", "NA", "NA", "N...
## $ FULL_TIME_POSITION_Y_N         <chr> "y", "NA", "y", "y", "y", "y", ...
## $ VISA_CLASS                     <chr> "H-1B", "greencard", "H-1B", "H...
## $ PREVAILING_WAGE_PER_YEAR       <chr> "NA", "NA", "NA", "NA", "NA", "...
## $ PAID_WAGE_PER_YEAR             <dbl> 62171, 91440, 49470, 43800, 170...
## $ JOB_TITLE_SUBGROUP             <chr> "software engineer", "assistant...

Do some conversions

sd$dec_dt = mdy(sd$DECISION_DATE)
sd$dec_yr = year(sd$dec_dt)
table(sd$dec_yr)
## 
##  2011  2012  2013  2014  2015 
##  7063  6901 48641 64764 39909
sd$full = (sd$FULL_TIME_POSITION_Y_N)
table(sd$full)
## 
##      n     NA      y 
##   3591  11093 152594
# Convert NA to Miss
sd$full[is.na(sd$full)] = "Miss"
table(sd$full)
## 
##      n     NA      y 
##   3591  11093 152594
#Hmm
sum(is.na(sd$full))
## [1] 0
#OK
sd$full[sd$full=="NA"] = "Miss"
table(sd$full)
## 
##   Miss      n      y 
##  11093   3591 152594
sd$full = factor(sd$full)

sd$jts = factor(sd$JOB_TITLE_SUBGROUP)
table(sd$jts)
## 
##   assistant professor              attorney      business analyst 
##                 18866                  1488                 27811 
##          data analyst        data scientist management consultant 
##                  3840                  1227                   770 
##     software engineer               teacher 
##                 99364                 13912
sd$pw = sd$PAID_WAGE_PER_YEAR
summary(sd$pw)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10500   63000   78600   85533  100006 2500000
table(sd$WORK_STATE)
## 
##                   AK                   AL              ALABAMA 
##                   60                  542                   17 
##               ALASKA                   AR              ARIZONA 
##                    6                  537                  206 
##             ARKANSAS                   AZ                   CA 
##                   21                 2364                42677 
##           CALIFORNIA                   CO             COLORADO 
##                 4105                 1526                   88 
##          CONNECTICUT                   CT                   DC 
##                   82                 1941                 1325 
##                   DE             DELAWARE DISTRICT OF COLUMBIA 
##                  823                   51                   45 
##                   FL              FLORIDA                   GA 
##                 3901                  163                 5387 
##              GEORGIA                   GU                 GUAM 
##                  228                   61                    1 
##               HAWAII                   HI                   IA 
##                    8                  162                  954 
##                   ID                IDAHO                   IL 
##                  180                    6                 7057 
##             ILLINOIS                   IN              INDIANA 
##                  354                 1570                   47 
##                 IOWA               KANSAS             KENTUCKY 
##                   58                   51                   52 
##                   KS                   KY                   LA 
##                  695                  589                  824 
##            LOUISIANA                   MA                MAINE 
##                   59                 6398                   11 
##             MARYLAND        MASSACHUSETTS                   MD 
##                  153                  450                 3122 
##                   ME                   MI             MICHIGAN 
##                  205                 3625                  219 
##            MINNESOTA          MISSISSIPPI             MISSOURI 
##                   82                   13                  112 
##                   MN                   MO              MONTANA 
##                 2034                 2661                    7 
##                   MP                   MS                   MT 
##                   58                  330                   56 
##                   NC                   ND                   NE 
##                 3402                  133                  563 
##             NEBRASKA               NEVADA        NEW HAMPSHIRE 
##                   35                   19                   16 
##           NEW JERSEY           NEW MEXICO             NEW YORK 
##                  999                   38                  551 
##                   NH                   NJ                   NM 
##                  411                 9199                  537 
##       NORTH CAROLINA         NORTH DAKOTA                   NV 
##                  176                    5                  408 
##                   NY                   OH                 OHIO 
##                10822                 3053                  139 
##                   OK             OKLAHOMA                   OR 
##                  532                   19                 1319 
##               OREGON                   PA         PENNSYLVANIA 
##                  216                 4411                  314 
##                   PR          PUERTO RICO                   PW 
##                  108                    1                    1 
##         RHODE ISLAND                   RI                   SC 
##                   21                  507                  823 
##                   SD       SOUTH CAROLINA         SOUTH DAKOTA 
##                   91                   28                   14 
##            TENNESSEE                TEXAS                   TN 
##                   39                 1012                 1048 
##                   TX                   UT                 UTAH 
##                14486                  719                   38 
##                   VA              VERMONT                   VI 
##                 5709                   41                   97 
##       VIRGIN ISLANDS             VIRGINIA                   VT 
##                   12                  322                   90 
##                   WA           WASHINGTON        WEST VIRGINIA 
##                 4354                  256                   10 
##                   WI            WISCONSIN                   WV 
##                 1357                   76                  297 
##                   WY              WYOMING 
##                   44                    1
# Annoying! We could rename the complete spellings or do something fast and dirty.

sd$WORK_STATE[nchar(sd$WORK_STATE) > 2 ] = "Miss"
table(sd$WORK_STATE)
## 
##    AK    AL    AR    AZ    CA    CO    CT    DC    DE    FL    GA    GU 
##    60   542   537  2364 42677  1526  1941  1325   823  3901  5387    61 
##    HI    IA    ID    IL    IN    KS    KY    LA    MA    MD    ME    MI 
##   162   954   180  7057  1570   695   589   824  6398  3122   205  3625 
##  Miss    MN    MO    MP    MS    MT    NC    ND    NE    NH    NJ    NM 
## 11093  2034  2661    58   330    56  3402   133   563   411  9199   537 
##    NV    NY    OH    OK    OR    PA    PR    PW    RI    SC    SD    TN 
##   408 10822  3053   532  1319  4411   108     1   507   823    91  1048 
##    TX    UT    VA    VI    VT    WA    WI    WV    WY 
## 14486   719  5709    97    90  4354  1357   297    44
# Interesting!!

Examine paid wage per year relationship with various categories

summary(sd$jts)
##   assistant professor              attorney      business analyst 
##                 18866                  1488                 27811 
##          data analyst        data scientist management consultant 
##                  3840                  1227                   770 
##     software engineer               teacher 
##                 99364                 13912
sd %>% ggplot(aes(x=jts,y=pw)) + geom_boxplot() + coord_flip()

# Outlier, what to do?
#sd %>% ggplot(aes(x=jts,y=pw)) + geom_boxplot() + coord_flip() + scale_x_log10()

# Hmm
sd %>% ggplot(aes(x=jts,y=pw)) + geom_boxplot() + scale_y_log10() + coord_flip()