Data 607: Final Project

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(usmap)
library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Load Data

# URL of the state immigration statistics CSV hosted on GitHub.
link <- "https://raw.githubusercontent.com/beninbar/DATA-607/main/Final%20project/data/state_immigration_stats.csv"

# Read the CSV into a tibble; show_col_types = FALSE suppresses the
# column-specification message that read_csv() would otherwise print.
state_data <- link |>
  read_csv(show_col_types = FALSE)

## New names:
## • `` -> `...1`

How do wages for US-born vs. foreign-born residents differ, based on the available data?

# Collapse state_data to one row per state: average the foreign-born and
# US-born median wage columns within each state, then compute the gap
# (US-born minus foreign-born) from those two per-state values.
median_wages <- state_data |>
  group_by(state) |>
  summarise(
    foreign_born_median_wages = mean(foreign_median_wages),
    us_born_median_wages = mean(us_median_wages)
  ) |>
  mutate(diff_wages = us_born_median_wages - foreign_born_median_wages)

# Scatterplot of foreign-born (y) against US-born (x) wages, one point per
# row of median_wages, labelled with the state and overlaid with a linear fit.
ggplot(
  data = median_wages,
  mapping = aes(x = us_born_median_wages, y = foreign_born_median_wages)
) +
  # Only geom_point() is used. Layering geom_jitter() on top of it drew every
  # state a second time at a randomly displaced position, which doubled the
  # points and detached the extra copies from their labels and from the
  # fitted line.
  geom_point() +
  geom_text(aes(label = state), color = "black", size = 4, vjust = 1) +
  geom_smooth(method = "lm") +
  labs(
    title = "2019 Median Annual Wages by Birth",
    subtitle = "Source: Migration Policy Institute via American Community Survey"
  )

## `geom_smooth()` using formula 'y ~ x'

# Correlation between the per-state US-born and foreign-born wage columns.
with(median_wages, cor(us_born_median_wages, foreign_born_median_wages))

## [1] 0.8501121

# One-sample t-test of the per-state wage gaps against a mean of zero,
# reported with a 95% confidence interval.
t.test(x = median_wages$diff_wages, mu = 0, conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  median_wages$diff_wages
## t = 6.2153, df = 50, p-value = 1.012e-07
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  3743.393 7318.019
## sample estimates:
## mean of x 
##  5530.706

From reviewing the scatterplot of median wages by birth, it is clear that there is a very strong linear relationship between the two: states where US-born residents earn more are also states where foreign-born residents earn more. The correlation between the two variables is very high (about 0.85), which leads us to hypothesize that state-level employment factors are a much stronger predictor of earnings for all of a state's residents. A one-sample t-test on the state-level wage differences (US-born minus foreign-born) nevertheless shows a statistically significant difference between the two median wages, with a 95% confidence interval for the true mean difference of (3743.393, 7318.019).

Top/Bottom 5 Wage Differential

# First five rows after sorting by wage gap, largest gap first.
median_wages |>
  arrange(desc(diff_wages)) |>
  slice_head(n = 5)

## # A tibble: 5 × 4
##   state foreign_born_median_wages us_born_median_wages diff_wages
##   <chr>                     <dbl>                <dbl>      <dbl>
## 1 NE                        49078                64765      15687
## 2 ND                        50782                65889      15107
## 3 MA                        74264                89285      15021
## 4 NM                        38877                53667      14790
## 5 RI                        58450                72867      14417

# Last five rows after sorting by wage gap, largest gap first, so the
# smallest (most negative) gap is printed at the bottom.
median_wages |>
  arrange(desc(diff_wages)) |>
  slice_tail(n = 5)

## # A tibble: 5 × 4
##   state foreign_born_median_wages us_born_median_wages diff_wages
##   <chr>                     <dbl>                <dbl>      <dbl>
## 1 AR                        51567                48785      -2782
## 2 MS                        47899                45018      -2881
## 3 VA                        81382                75775      -5607
## 4 WV                        53858                46580      -7278
## 5 MI                        68753                58824      -9929

While it is interesting to see this information in a list, a full map visualization may give a better holistic view of the whole country’s statistics.

US and Foreign Born Maps

Differential Map

# Choropleth of the per-state wage gap (US-born minus foreign-born).
plot_usmap(data = median_wages, values = "diff_wages", color = "white") +
  # diff_wages is signed, so a diverging scale centred on zero is used:
  # white marks no gap, blue marks US-born > foreign-born, and red marks
  # foreign-born > US-born. A sequential white-to-blue scale would put white
  # on the most negative value instead of on zero.
  scale_fill_gradient2(
    low = "red", mid = "white", high = "blue", midpoint = 0,
    name = "Median Wages Difference (2019)",
    labels = scales::comma # spelled out; `label` only worked via partial matching
  ) +
  labs(title = "Median Wage Comparison", subtitle = "US - Foreign Wages.") +
  theme(legend.position = "right")

## Warning: Ignoring unknown parameters: linewidth

Separated Median Wage Map

# Choropleth of the per-state US-born wage column, stored for later layout.
us_map <- plot_usmap(
  data = median_wages, values = "us_born_median_wages", color = "white"
) +
  scale_fill_continuous(
    low = "white", high = "blue", name = "Median US Born Wages (2019)",
    labels = scales::comma, # spelled out; `label` only worked via partial matching
    # The fill limits span both wage columns, so a given shade of blue stands
    # for a fixed wage level instead of being rescaled to this column's own
    # minimum and maximum.
    limits = range(
      median_wages$us_born_median_wages,
      median_wages$foreign_born_median_wages,
      na.rm = TRUE
    )
  ) +
  labs(title = "Median Wages (US Born)") +
  theme(legend.position = "right")

## Warning: Ignoring unknown parameters: linewidth

# Choropleth of the per-state foreign-born wage column, stored for later layout.
foreign_map <- plot_usmap(
  data = median_wages, values = "foreign_born_median_wages", color = "white"
) +
  scale_fill_continuous(
    low = "white", high = "blue", name = "Median Foreign Born Wages (2019)",
    labels = scales::comma, # spelled out; `label` only worked via partial matching
    # The fill limits span both wage columns, so a given shade of blue stands
    # for a fixed wage level instead of being rescaled to this column's own
    # minimum and maximum.
    limits = range(
      median_wages$us_born_median_wages,
      median_wages$foreign_born_median_wages,
      na.rm = TRUE
    )
  ) +
  labs(title = "Median Wages (Foreign Born)") +
  theme(legend.position = "right")

## Warning: Ignoring unknown parameters: linewidth

# Lay the two stored maps out in two columns (US-born left, foreign-born right).
grid.arrange(grobs = list(us_map, foreign_map), ncol = 2)

Data Cleanup

Prepare the scraped data with cleaner formatting and correct data types.

# Build the per-bin table used by the earnings-range charts.
#  - Rows with order == 0 are dropped (presumably a non-bin summary row;
#    confirm against the scraped data).
#  - earning_bin is `header` with the literal pieces "Earned" and
#    ", or incurred a loss" removed, then trimmed of surrounding whitespace.
#  - The percentage strings lose their first "%" and become proportions.
state_bin <- state_data |>
  filter(order != 0) |>
  mutate(
    earning_bin = header |>
      str_remove_all("Earned") |>
      str_remove_all(", or incurred a loss") |>
      str_trim(),
    foreign_born_perc = as.numeric(str_remove(foreign_born, "%")) / 100,
    us_born_perc = as.numeric(str_remove(us_born, "%")) / 100
  )

Foreign Workers in Highest Wage Band

# Bar chart of the foreign-born share in the "$75,000 or more" bin, one bar
# per state, ordered from the smallest share to the largest.
state_bin |>
  filter(earning_bin == "$75,000 or more") |>
  ggplot(aes(x = reorder(state, foreign_born_perc), y = foreign_born_perc)) +
  # The fill is set as a constant on the layer. Writing fill = 'blue' inside
  # aes() maps it as a one-level variable, so ggplot picks the default palette
  # colour instead of blue and adds a legend that then has to be hidden.
  geom_col(fill = "blue") +
  # foreign_born_perc is stored as a proportion; show it as a percentage so
  # the tick labels agree with the "(%)" axis title.
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.9)) +
  labs(
    x = "States",
    y = "Foreign Born (%)",
    title = "Percentage of Foreign Born making $75,000+"
  )

Number of states in each binned wage range

# For each state, keep the earnings bin holding the largest share of
# foreign-born workers and the bin holding the largest share of US-born
# workers. The result is long: one row per (state, measure_name) whose rank
# is 1, and it stays grouped by state.
state_bin_top <- state_bin |>
  group_by(state) |>
  mutate(
    # Both ranks use ties.method = "last" so exactly one bin per state gets
    # rank 1. With the default ("average"), two bins tied for the top each get
    # rank 1.5 and the state is silently dropped by the == 1 filter below.
    top_rank_foreign = rank(-foreign_born_perc, ties.method = "last"),
    top_rank_us = rank(-us_born_perc, ties.method = "last")
  ) |>
  pivot_longer(
    cols = c("top_rank_foreign", "top_rank_us"),
    names_to = "measure_name",
    values_to = "measure_values"
  ) |>
  filter(measure_values == 1)


# Dodged bar chart: for each earnings bin, the number of states in which that
# bin holds the largest share of foreign-born workers (top_rank_foreign) and
# of US-born workers (top_rank_us).
state_bin |>
  group_by(state) |>
  mutate(
    # ties.method = "last" gives exactly one rank-1 bin per state and measure.
    # With the default ("average"), bins tied for the top get rank 1.5, so the
    # state would be missing from the counts plotted below.
    top_rank_foreign = rank(-foreign_born_perc, ties.method = "last"),
    top_rank_us = rank(-us_born_perc, ties.method = "last")
  ) |>
  ungroup() |>
  pivot_longer(
    cols = c("top_rank_foreign", "top_rank_us"),
    names_to = "measure_name",
    values_to = "measure_values"
  ) |>
  filter(measure_values == 1) |>
  # count() returns an ungrouped table, so there is no grouped-output message
  # from summarise().
  count(earning_bin, measure_name, name = "count") |>
  ggplot(aes(x = reorder(earning_bin, count), y = count, fill = measure_name)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.9)) +
  labs(
    x = "Earnings Range",
    y = "Number of States",
    title = "Highest Frequency Wage Bins"
  )

## `summarise()` has grouped output by 'earning_bin'. You can override using the
## `.groups` argument.

Where is the disparity occurring?

# Reshape state_bin_top to one row per state, with the `order` value of the
# top US-born bin and of the top foreign-born bin in separate columns, then
# take their difference (US-born minus foreign-born).
bin_diff <- state_bin_top |>
  group_by(state) |>
  select(all_of(c("state", "measure_name", "order"))) |>
  pivot_wider(names_from = "measure_name", values_from = "order") |>
  mutate(top_rank = top_rank_us - top_rank_foreign)
    
# Choropleth of the per-state difference between the top US-born bin and the
# top foreign-born bin (bin_diff$top_rank).
plot_usmap(data = bin_diff, values = "top_rank", color = "white") +
  # top_rank is signed (see the subtitle), so a diverging scale centred on
  # zero is used: white marks no difference, blue marks US > Foreign, and red
  # marks Foreign > US. A sequential white-to-blue scale would put white on
  # the most negative value instead of on zero.
  scale_fill_gradient2(
    low = "red", mid = "white", high = "blue", midpoint = 0,
    name = "Difference in most frequent wage range",
    labels = scales::comma # spelled out; `label` only worked via partial matching
  ) +
  labs(
    title = "Bin Difference (US - Foreign)",
    subtitle = "Bins are scaled 1 - 7 (positive values:US>Foreign; negative: Foreign>US)"
  ) +
  theme(legend.position = "right")

## Warning: Ignoring unknown parameters: linewidth