library(tidyverse)
library(tidycensus)
library(tigris)
library(readxl)
library(tmap)
library(sf)
library(DT)
library(gt)

Read in the raw data

setwd("C:\\Users\\sfraley\\Downloads\\PCV")

areas <- readxl::read_excel("Zip Codes_Rural and Urban_Reference.xlsx", sheet =2,
                  col_types = c("text","numeric","numeric","numeric","numeric","numeric")) %>%
  rename("Zip" = `Zip Code`)

wage <- readxl::read_excel("Employee Wage and Zip Code Data.xlsx", sheet =1,
                            col_types = c("text","numeric","text","text")) %>%
  rename("Zip" = `Zip Code of Residence`)

\(-\) \(-\) \(-\) \(-\)

Create a “rural.flag” variable, for zip codes that have over 75% rural population. Also create “rural.degree”, per US Census Method to conduct three way ANOVA. Join zip code characteristics to wage data.

#find quantiles
join <- wage %>%
  left_join(areas, by = c("Zip" = "Zip")) %>%
  mutate(count = 1,
         per.rural = round(`% Rural`, 2)) %>% #round percent
  na.omit()

#75th quartile value


#flag for if values are above the 75th quartile
#degree flag per Census methodology to find level of rural pop
join <- join %>%
  mutate(
    rural.flag = ifelse(`per.rural` > 0.75, 1, 0),
         (rural.degree = case_when(
                       `% Rural` == 1 ~ "Fully Rural",
                       `% Rural` > 0.5 & `% Rural` < 1 ~ "Mostly Rural",
                       `% Rural` < .5 ~ "Urban",
  ))
)


print(paste0("Total rural zips: ", sum(join$rural.flag)))
## [1] "Total rural zips: 91"
\(-\) \(-\) \(-\) \(-\)
“join” data table

\(-\) \(-\) \(-\) \(-\)
Summary stats for employment and zip code data

First, if we are going to compare different means from the population (rural vs non-rural), test for statistical significance between the means to make sure we can discuss mean differences accurately. Start with three way ANOVA using the group flag. Because wage is in both hourly and salary format, convert them both to a biweekly amount to standardize to the same time period.

anova <- join %>%
  mutate(standardize.wage = ifelse(`Wage Type` == "Salary", Wage / 12, (Wage * 80))) %>%
  mutate(rural.degree = case_when(
                       `% Rural` == 1 ~ "Fully Rural",
                       `% Rural` > 0.5 & `% Rural` < 1 ~ "Mostly Rural",
                       `% Rural` < .5 ~ "Urban",
  ))

anova.res <- aov(standardize.wage ~ rural.degree, data = anova)
summary(anova.res)
##               Df    Sum Sq   Mean Sq F value   Pr(>F)    
## rural.degree   2 5.432e+08 271612637   16.95 7.66e-08 ***
## Residuals    484 7.755e+09  16022439                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANOVA shows statistical significance. But we are going to use only two groups for analysis (rural and non rural), so use independent sample t-test to test the difference in means between two groups

Independent sample t-test

t.test <- t.test(Wage ~ rural.flag, var.equal = T, data = anova)
t.test
## 
##  Two Sample t-test
## 
## data:  Wage by rural.flag
## t = 5.9737, df = 485, p-value = 4.493e-09
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  24674.39 48862.20
## sample estimates:
## mean in group 0 mean in group 1 
##        52955.08        16186.78

t-test is also significant for difference in means

Plot and visualize data to get a better idea of what we are working with.

Plotting Salary vs % Rural, and Hourly vs % Rural

library(scales)
salary.chart <- join %>%
  filter(`Wage Type` == "Salary") %>%
  select(Zip, Wage, `% Rural`)
  
hourly.chart <- join %>%
  filter(`Wage Type` == "Hourly") %>%
  select(Zip, Wage, `% Rural`)
  
salary.chart%>%
  ggplot(aes(x = `% Rural`, y = Wage))+
 theme_classic() +
  geom_point(size = 3) +
  xlab("Percent Rural Population") +
  ylab("Salary") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::dollar_format(prefix="$"))

hourly.chart%>%
  ggplot(aes(x = `% Rural`, y = Wage))+
 theme_classic() +
  geom_point(size = 3)+
  xlab("Percent Rural Population") +
  ylab("Hourly") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::dollar_format(prefix="$"))


No clear relationship, appears to be more observations and variation in urban areas.
Salary wage distribution, all areas

ggplot(join, aes(x=Wage)) +
  theme_classic()+
  geom_histogram(data = subset(join, `Wage Type` == "Salary"), fill = "#007abd", alpha = 0.5, col = "black") +
    geom_histogram(data = subset(join, `Wage Type` == "Salary" & rural.flag == 1), alpha = 0.8, fill = "#3349b0", col = "black") +
     geom_vline(data = subset(join, `Wage Type` == "Salary"), aes(xintercept = mean(Wage)),col='green',size=2, alpha = 0.7) +
    geom_vline(data = subset(join, `Wage Type` == "Salary" & rural.flag == 1), aes(xintercept = mean(Wage)),col='red',size=2, alpha = 0.7) +
    scale_x_continuous(labels = scales::dollar_format(prefix="$")) + 
  xlab("Salary") 


Hourly wage distribution, all areas

ggplot(join, aes(x=Wage)) +
  theme_classic()+
  geom_histogram(data = subset(join, `Wage Type` == "Hourly"), fill = "#007abd", alpha = 0.5, col = "black") +
    geom_histogram(data = subset(join, `Wage Type` == "Hourly" & rural.flag == 1), alpha = 0.8, fill = "#3349b0", col = "black") +
    scale_x_continuous(labels = scales::dollar_format(prefix="$")) + 
   geom_vline(data = subset(join, `Wage Type` == "Hourly"), aes(xintercept = mean(Wage)),col='green',size=2, alpha = 0.7) +
    geom_vline(data = subset(join, `Wage Type` == "Hourly" & rural.flag == 1), aes(xintercept = mean(Wage)),col='red',size=2, alpha = 0.7) +
  xlab("Hourly")


Summary Table

summary.table <- join %>%
   group_by(rural.flag, `Wage Type`) %>%
   summarize(across(Wage, mean), 
             across(count, sum))

DT::datatable(summary.table)
overall.summary.table <- join %>%
  group_by(`Wage Type`) %>%
   summarize(across(Wage, mean), 
             across(count, sum))
print(overall.summary.table)
## # A tibble: 2 x 3
##   `Wage Type`    Wage count
##   <chr>         <dbl> <dbl>
## 1 Hourly         14.1   232
## 2 Salary      88000.    255
DT::datatable(overall.summary.table)

Grouping by Zip Codes
Use distinct function to find list of zip codes present in wage data, and their associated population data. Sum the count and rural flags to find the count of zip codes in wage data, and how many are rural..

wage.zips <- join %>%
  distinct(Zip, `Rural Population`, `% Rural`, rural.flag, count)

num.wage.zips <- sum(wage.zips$count)
num.rural.zips <- sum(wage.zips$rural.flag)

Zip code totals:

## Total zip codes in wage data:  230
## Zip codes that are rural:  29


Grouping by location / place based analysis
Plot zip codes using tigris() and sf() package to visually identify clusters.

\(-\) \(-\)

library(sf)
library(tigris)

options(tigris_use_cache = TRUE)

states <- tigris::states() %>%
  st_as_sf() %>%
  select(STATEFP, NAME)

zip.points <- tigris::zctas() %>%
  filter(ZCTA5CE20 %in% wage.zips$Zip) %>% st_centroid()

zip.plot <- wage.zips %>% 
  left_join(zip.points, by = c("Zip" = "ZCTA5CE20")) %>%
  mutate(index.id = substr(Zip, 0, 1),
         prefix = substr(Zip, 0, 3)) %>%
  st_as_sf() %>%
  st_join(states)


#st_write(zip.plot,"zip_plot2.shp")
tmap_mode("view") 
tm_shape(zip.plot) +
  tm_dots(col = c("index.id"), size = 0.7) +
  tm_shape(zip.plot) +
  tm_dots(col = c("rural.flag"), size = 0.2) +
  tm_shape(states) +
  tm_borders()

Visually, we see large clusters in Oregon and Minnesota that have high number of observations of both rural and urban zip codes. I want to look at variation in rural vs overall total wage averages for these specific geographies. Filter the state shapefiles for just an OR and MN boundary, use st_intersection() to selection wage observations in these two states.

cluster <- zip.plot %>%
  filter(NAME == "Oregon" | NAME == "Minnesota") %>%
  left_join(wage, by = c("Zip"))


tmap_mode("view") 
tm_shape(cluster) +
  tm_dots(col = c("index.id"), size = 0.7) +
  tm_shape(cluster) +
  tm_dots(col = c("rural.flag"), size = 0.2) +
  tm_shape(states) +
  tm_borders()
library(sf)


cluster.totals <- cluster %>%
  mutate(State = ifelse(STATEFP == "27", "Minnesota", "Oregon")) %>%
  group_by(State) %>%
  summarize_at(c("count", "rural.flag"), sum) %>%
  st_drop_geometry() 

cluster.totals %>%
  gt() %>%
  cols_label(count = "Total Wages Reported", rural.flag = "Wages Reported in Rural Zips")
State Total Wages Reported Wages Reported in Rural Zips
Minnesota 201 52
Oregon 131 31
cluster.mean <- cluster %>%
  group_by(STATEFP, `Wage Type`) %>%
  summarize_at(c("Wage"), mean) %>%
  select(STATEFP, `Wage Type`, Wage) %>%
  mutate(joinid = paste0(STATEFP, `Wage Type`)) %>%
  st_drop_geometry()

cluster.rural.mean <- cluster %>%
  group_by(STATEFP, rural.flag, `Wage Type`) %>%
  summarize_at(c("Wage"), mean) %>%
  select(STATEFP, rural.flag, `Wage Type`, Wage) %>%
  mutate(joinid = paste0(STATEFP, `Wage Type`))

cluster.table <- cluster.rural.mean %>%
  left_join(cluster.mean, by = c("joinid")) %>%
  filter(rural.flag == 1) %>%
  mutate(State = ifelse(STATEFP.x == "27", "Minnesota", "Oregon")) %>%
  select(State, `Wage Type.x`, "Rural Mean" = Wage.x, "Total Mean" = Wage.y) %>%
  mutate(Difference = round(`Rural Mean` - `Total Mean`, 2)) %>%
  st_drop_geometry()

rm(cluster.mean, cluster.rural.mean)

cluster.table %>%
  gt() %>%
  fmt_number(
    columns = c("Rural Mean", "Total Mean", "Difference"),
    decimals = 2,
    use_seps = T
  )
State Wage Type.x Rural Mean Total Mean Difference
Minnesota Hourly 13.90 14.61 −0.70
Minnesota Salary 69,199.60 91,693.03 −22,493.43
Oregon Hourly 14.79 14.91 −0.12
Oregon Salary 79,540.38 97,244.93 −17,704.56

\(-\) \(-\) ``
Grouping by Company
Look at rural employment by companies. Mean wages for companies that employ the most rural jobs.

companies <- join %>%
  group_by(`Company ID`) %>%
    summarize(across(Wage, mean), 
             across(count, sum),
             across(rural.flag, sum))
 print(paste0("Total wages in rural areas: ",sum(companies$rural.flag)))
## [1] "Total wages in rural areas: 91"
companies.wage <- join %>%
  filter(rural.flag == 1) %>%
  group_by(`Company ID`, `rural.flag`, `Wage Type`) %>%
    summarize(across(Wage, mean))

companies.wage %>%
  DT::datatable()