Good Jobs Lab Fellow- Assessment

library(tidyverse)
library(tidycensus)
library(tigris)
library(readxl)
library(tmap)
library(sf)
library(DT)
library(gt)

Read in the raw data

setwd("C:\\Users\\sfraley\\Downloads\\PCV")

areas <- readxl::read_excel("Zip Codes_Rural and Urban_Reference.xlsx", sheet =2,
                  col_types = c("text","numeric","numeric","numeric","numeric","numeric")) %>%
  rename("Zip" = `Zip Code`)

wage <- readxl::read_excel("Employee Wage and Zip Code Data.xlsx", sheet =1,
                            col_types = c("text","numeric","text","text")) %>%
  rename("Zip" = `Zip Code of Residence`)

\(-\) \(-\) \(-\) \(-\)

Create a “rural.flag” variable, for zip codes that have over 75% rural population. Also create “rural.degree”, per US Census Method to conduct three way ANOVA. Join zip code characteristics to wage data.

#find quantiles
join <- wage %>%
  left_join(areas, by = c("Zip" = "Zip")) %>%
  mutate(count = 1,
         per.rural = round(`% Rural`, 2)) %>% #round percent
  na.omit()

#75th quartile value


#flag for if values are above the 75th quartile
#degree flag per Census methodology to find level of rural pop
join <- join %>%
  mutate(
    rural.flag = ifelse(`per.rural` > 0.75, 1, 0),
         (rural.degree = case_when(
                       `% Rural` == 1 ~ "Fully Rural",
                       `% Rural` > 0.5 & `% Rural` < 1 ~ "Mostly Rural",
                       `% Rural` < .5 ~ "Urban",
  ))
)


print(paste0("Total rural zips: ", sum(join$rural.flag)))

## [1] "Total rural zips: 91"

\(-\) \(-\) \(-\) \(-\) “join” data table

\(-\) \(-\) \(-\) \(-\)
Summary stats for employment and zip code data

First, if we are going to compare different means from the population (rural vs non-rural), test for statistical significance between the means to make sure we can discuss mean differences accurately. Start with three way ANOVA using the group flag. Because wage is in both hourly and salary format, convert them both to a biweekly amount to standardize to the same time period.

anova <- join %>%
  mutate(standardize.wage = ifelse(`Wage Type` == "Salary", Wage / 12, (Wage * 80))) %>%
  mutate(rural.degree = case_when(
                       `% Rural` == 1 ~ "Fully Rural",
                       `% Rural` > 0.5 & `% Rural` < 1 ~ "Mostly Rural",
                       `% Rural` < .5 ~ "Urban",
  ))

anova.res <- aov(standardize.wage ~ rural.degree, data = anova)
summary(anova.res)

##               Df    Sum Sq   Mean Sq F value   Pr(>F)    
## rural.degree   2 5.432e+08 271612637   16.95 7.66e-08 ***
## Residuals    484 7.755e+09  16022439                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANOVA shows statistical significance. But we are going to use only two groups for analysis (rural and non rural), so use independent sample t-test to test the difference in means between two groups

Independent sample t-test

t.test <- t.test(Wage ~ rural.flag, var.equal = T, data = anova)
t.test

## 
##  Two Sample t-test
## 
## data:  Wage by rural.flag
## t = 5.9737, df = 485, p-value = 4.493e-09
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  24674.39 48862.20
## sample estimates:
## mean in group 0 mean in group 1 
##        52955.08        16186.78

t-test is also significant for difference in means

Plot and visualize data to get a better idea of what we are working with.

Plotting Salary vs % Rural, and Hourly vs % Rural

library(scales)
salary.chart <- join %>%
  filter(`Wage Type` == "Salary") %>%
  select(Zip, Wage, `% Rural`)
  
hourly.chart <- join %>%
  filter(`Wage Type` == "Hourly") %>%
  select(Zip, Wage, `% Rural`)
  
salary.chart%>%
  ggplot(aes(x = `% Rural`, y = Wage))+
 theme_classic() +
  geom_point(size = 3) +
  xlab("Percent Rural Population") +
  ylab("Salary") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::dollar_format(prefix="$"))

hourly.chart%>%
  ggplot(aes(x = `% Rural`, y = Wage))+
 theme_classic() +
  geom_point(size = 3)+
  xlab("Percent Rural Population") +
  ylab("Hourly") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::dollar_format(prefix="$"))

No clear relationship, appears to be more observations and variation in urban areas.
Salary wage distribution, all areas

ggplot(join, aes(x=Wage)) +
  theme_classic()+
  geom_histogram(data = subset(join, `Wage Type` == "Salary"), fill = "#007abd", alpha = 0.5, col = "black") +
    geom_histogram(data = subset(join, `Wage Type` == "Salary" & rural.flag == 1), alpha = 0.8, fill = "#3349b0", col = "black") +
     geom_vline(data = subset(join, `Wage Type` == "Salary"), aes(xintercept = mean(Wage)),col='green',size=2, alpha = 0.7) +
    geom_vline(data = subset(join, `Wage Type` == "Salary" & rural.flag == 1), aes(xintercept = mean(Wage)),col='red',size=2, alpha = 0.7) +
    scale_x_continuous(labels = scales::dollar_format(prefix="$")) + 
  xlab("Salary")

Hourly wage distribution, all areas

ggplot(join, aes(x=Wage)) +
  theme_classic()+
  geom_histogram(data = subset(join, `Wage Type` == "Hourly"), fill = "#007abd", alpha = 0.5, col = "black") +
    geom_histogram(data = subset(join, `Wage Type` == "Hourly" & rural.flag == 1), alpha = 0.8, fill = "#3349b0", col = "black") +
    scale_x_continuous(labels = scales::dollar_format(prefix="$")) + 
   geom_vline(data = subset(join, `Wage Type` == "Hourly"), aes(xintercept = mean(Wage)),col='green',size=2, alpha = 0.7) +
    geom_vline(data = subset(join, `Wage Type` == "Hourly" & rural.flag == 1), aes(xintercept = mean(Wage)),col='red',size=2, alpha = 0.7) +
  xlab("Hourly")

Summary Table

summary.table <- join %>%
   group_by(rural.flag, `Wage Type`) %>%
   summarize(across(Wage, mean), 
             across(count, sum))

DT::datatable(summary.table)

overall.summary.table <- join %>%
  group_by(`Wage Type`) %>%
   summarize(across(Wage, mean), 
             across(count, sum))
print(overall.summary.table)

## # A tibble: 2 x 3
##   `Wage Type`    Wage count
##   <chr>         <dbl> <dbl>
## 1 Hourly         14.1   232
## 2 Salary      88000.    255

DT::datatable(overall.summary.table)

Grouping by Zip Codes
Use distinct function to find list of zip codes present in wage data, and their associated population data. Sum the count and rural flags to find the count of zip codes in wage data, and how many are rural..

wage.zips <- join %>%
  distinct(Zip, `Rural Population`, `% Rural`, rural.flag, count)

num.wage.zips <- sum(wage.zips$count)
num.rural.zips <- sum(wage.zips$rural.flag)

Zip code totals:

## Total zip codes in wage data:  230

## Zip codes that are rural:  29

Grouping by location / place based analysis
Plot zip codes using tigris() and sf() package to visually identify clusters.

\(-\) \(-\)

library(sf)
library(tigris)

options(tigris_use_cache = TRUE)

states <- tigris::states() %>%
  st_as_sf() %>%
  select(STATEFP, NAME)

zip.points <- tigris::zctas() %>%
  filter(ZCTA5CE20 %in% wage.zips$Zip) %>% st_centroid()

zip.plot <- wage.zips %>% 
  left_join(zip.points, by = c("Zip" = "ZCTA5CE20")) %>%
  mutate(index.id = substr(Zip, 0, 1),
         prefix = substr(Zip, 0, 3)) %>%
  st_as_sf() %>%
  st_join(states)


#st_write(zip.plot,"zip_plot2.shp")

tmap_mode("view") 
tm_shape(zip.plot) +
  tm_dots(col = c("index.id"), size = 0.7) +
  tm_shape(zip.plot) +
  tm_dots(col = c("rural.flag"), size = 0.2) +
  tm_shape(states) +
  tm_borders()

Visually, we see large clusters in Oregon and Minnesota that have high number of observations of both rural and urban zip codes. I want to look at variation in rural vs overall total wage averages for these specific geographies. Filter the state shapefiles for just an OR and MN boundary, use st_intersection() to selection wage observations in these two states.

cluster <- zip.plot %>%
  filter(NAME == "Oregon" | NAME == "Minnesota") %>%
  left_join(wage, by = c("Zip"))


tmap_mode("view") 
tm_shape(cluster) +
  tm_dots(col = c("index.id"), size = 0.7) +
  tm_shape(cluster) +
  tm_dots(col = c("rural.flag"), size = 0.2) +
  tm_shape(states) +
  tm_borders()

library(sf)


cluster.totals <- cluster %>%
  mutate(State = ifelse(STATEFP == "27", "Minnesota", "Oregon")) %>%
  group_by(State) %>%
  summarize_at(c("count", "rural.flag"), sum) %>%
  st_drop_geometry() 

cluster.totals %>%
  gt() %>%
  cols_label(count = "Total Wages Reported", rural.flag = "Wages Reported in Rural Zips")

State	Total Wages Reported	Wages Reported in Rural Zips
Minnesota	201	52
Oregon	131	31

cluster.mean <- cluster %>%
  group_by(STATEFP, `Wage Type`) %>%
  summarize_at(c("Wage"), mean) %>%
  select(STATEFP, `Wage Type`, Wage) %>%
  mutate(joinid = paste0(STATEFP, `Wage Type`)) %>%
  st_drop_geometry()

cluster.rural.mean <- cluster %>%
  group_by(STATEFP, rural.flag, `Wage Type`) %>%
  summarize_at(c("Wage"), mean) %>%
  select(STATEFP, rural.flag, `Wage Type`, Wage) %>%
  mutate(joinid = paste0(STATEFP, `Wage Type`))

cluster.table <- cluster.rural.mean %>%
  left_join(cluster.mean, by = c("joinid")) %>%
  filter(rural.flag == 1) %>%
  mutate(State = ifelse(STATEFP.x == "27", "Minnesota", "Oregon")) %>%
  select(State, `Wage Type.x`, "Rural Mean" = Wage.x, "Total Mean" = Wage.y) %>%
  mutate(Difference = round(`Rural Mean` - `Total Mean`, 2)) %>%
  st_drop_geometry()

rm(cluster.mean, cluster.rural.mean)

cluster.table %>%
  gt() %>%
  fmt_number(
    columns = c("Rural Mean", "Total Mean", "Difference"),
    decimals = 2,
    use_seps = T
  )

State	Wage Type.x	Rural Mean	Total Mean	Difference
Minnesota	Hourly	13.90	14.61	−0.70
Minnesota	Salary	69,199.60	91,693.03	−22,493.43
Oregon	Hourly	14.79	14.91	−0.12
Oregon	Salary	79,540.38	97,244.93	−17,704.56

\(-\) \(-\) ``
Grouping by Company
Look at rural employment by companies. Mean wages for companies that employ the most rural jobs.

companies <- join %>%
  group_by(`Company ID`) %>%
    summarize(across(Wage, mean), 
             across(count, sum),
             across(rural.flag, sum))
 print(paste0("Total wages in rural areas: ",sum(companies$rural.flag)))

## [1] "Total wages in rural areas: 91"

companies.wage <- join %>%
  filter(rural.flag == 1) %>%
  group_by(`Company ID`, `rural.flag`, `Wage Type`) %>%
    summarize(across(Wage, mean))

companies.wage %>%
  DT::datatable()

Good Jobs Lab Fellow- Assessment

Sam Fraley

4/30/2023