library(tidyverse)
library(tidycensus)
library(tigris)
library(readxl)
library(tmap)
library(sf)
library(DT)
library(gt)
Read in the raw data
setwd("C:\\Users\\sfraley\\Downloads\\PCV")
areas <- readxl::read_excel("Zip Codes_Rural and Urban_Reference.xlsx", sheet =2,
col_types = c("text","numeric","numeric","numeric","numeric","numeric")) %>%
rename("Zip" = `Zip Code`)
wage <- readxl::read_excel("Employee Wage and Zip Code Data.xlsx", sheet =1,
col_types = c("text","numeric","text","text")) %>%
rename("Zip" = `Zip Code of Residence`)
\(-\) \(-\) \(-\) \(-\)
Create a “rural.flag” variable, for zip codes that have over 75% rural population. Also create “rural.degree”, per US Census Method to conduct three way ANOVA. Join zip code characteristics to wage data.
#find quantiles
join <- wage %>%
left_join(areas, by = c("Zip" = "Zip")) %>%
mutate(count = 1,
per.rural = round(`% Rural`, 2)) %>% #round percent
na.omit()
#75th quartile value
#flag for if values are above the 75th quartile
#degree flag per Census methodology to find level of rural pop
join <- join %>%
mutate(
rural.flag = ifelse(`per.rural` > 0.75, 1, 0),
(rural.degree = case_when(
`% Rural` == 1 ~ "Fully Rural",
`% Rural` > 0.5 & `% Rural` < 1 ~ "Mostly Rural",
`% Rural` < .5 ~ "Urban",
))
)
print(paste0("Total rural zips: ", sum(join$rural.flag)))
## [1] "Total rural zips: 91"
\(-\) \(-\) \(-\) \(-\)
\(-\) \(-\) \(-\) \(-\)
Summary stats for employment and zip code data
First, if we are going to compare different means from the population (rural vs non-rural), test for statistical significance between the means to make sure we can discuss mean differences accurately. Start with three way ANOVA using the group flag. Because wage is in both hourly and salary format, convert them both to a biweekly amount to standardize to the same time period.
anova <- join %>%
mutate(standardize.wage = ifelse(`Wage Type` == "Salary", Wage / 12, (Wage * 80))) %>%
mutate(rural.degree = case_when(
`% Rural` == 1 ~ "Fully Rural",
`% Rural` > 0.5 & `% Rural` < 1 ~ "Mostly Rural",
`% Rural` < .5 ~ "Urban",
))
anova.res <- aov(standardize.wage ~ rural.degree, data = anova)
summary(anova.res)
## Df Sum Sq Mean Sq F value Pr(>F)
## rural.degree 2 5.432e+08 271612637 16.95 7.66e-08 ***
## Residuals 484 7.755e+09 16022439
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ANOVA shows statistical significance. But we are going to use only two groups for analysis (rural and non rural), so use independent sample t-test to test the difference in means between two groups
Independent sample t-test
t.test <- t.test(Wage ~ rural.flag, var.equal = T, data = anova)
t.test
##
## Two Sample t-test
##
## data: Wage by rural.flag
## t = 5.9737, df = 485, p-value = 4.493e-09
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 24674.39 48862.20
## sample estimates:
## mean in group 0 mean in group 1
## 52955.08 16186.78
t-test is also significant for difference in means
Plot and visualize data to get a better idea of what we are working with.
Plotting Salary vs % Rural, and Hourly vs % Rural
library(scales)
salary.chart <- join %>%
filter(`Wage Type` == "Salary") %>%
select(Zip, Wage, `% Rural`)
hourly.chart <- join %>%
filter(`Wage Type` == "Hourly") %>%
select(Zip, Wage, `% Rural`)
salary.chart%>%
ggplot(aes(x = `% Rural`, y = Wage))+
theme_classic() +
geom_point(size = 3) +
xlab("Percent Rural Population") +
ylab("Salary") +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::dollar_format(prefix="$"))
hourly.chart%>%
ggplot(aes(x = `% Rural`, y = Wage))+
theme_classic() +
geom_point(size = 3)+
xlab("Percent Rural Population") +
ylab("Hourly") +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::dollar_format(prefix="$"))
No clear relationship, appears to be more observations and variation in urban areas.
Salary wage distribution, all areas
ggplot(join, aes(x=Wage)) +
theme_classic()+
geom_histogram(data = subset(join, `Wage Type` == "Salary"), fill = "#007abd", alpha = 0.5, col = "black") +
geom_histogram(data = subset(join, `Wage Type` == "Salary" & rural.flag == 1), alpha = 0.8, fill = "#3349b0", col = "black") +
geom_vline(data = subset(join, `Wage Type` == "Salary"), aes(xintercept = mean(Wage)),col='green',size=2, alpha = 0.7) +
geom_vline(data = subset(join, `Wage Type` == "Salary" & rural.flag == 1), aes(xintercept = mean(Wage)),col='red',size=2, alpha = 0.7) +
scale_x_continuous(labels = scales::dollar_format(prefix="$")) +
xlab("Salary")
Hourly wage distribution, all areas
ggplot(join, aes(x=Wage)) +
theme_classic()+
geom_histogram(data = subset(join, `Wage Type` == "Hourly"), fill = "#007abd", alpha = 0.5, col = "black") +
geom_histogram(data = subset(join, `Wage Type` == "Hourly" & rural.flag == 1), alpha = 0.8, fill = "#3349b0", col = "black") +
scale_x_continuous(labels = scales::dollar_format(prefix="$")) +
geom_vline(data = subset(join, `Wage Type` == "Hourly"), aes(xintercept = mean(Wage)),col='green',size=2, alpha = 0.7) +
geom_vline(data = subset(join, `Wage Type` == "Hourly" & rural.flag == 1), aes(xintercept = mean(Wage)),col='red',size=2, alpha = 0.7) +
xlab("Hourly")
Summary Table
summary.table <- join %>%
group_by(rural.flag, `Wage Type`) %>%
summarize(across(Wage, mean),
across(count, sum))
DT::datatable(summary.table)
overall.summary.table <- join %>%
group_by(`Wage Type`) %>%
summarize(across(Wage, mean),
across(count, sum))
print(overall.summary.table)
## # A tibble: 2 x 3
## `Wage Type` Wage count
## <chr> <dbl> <dbl>
## 1 Hourly 14.1 232
## 2 Salary 88000. 255
DT::datatable(overall.summary.table)
Grouping by Zip Codes
Use distinct function to find list of zip codes present in wage data, and their associated population data. Sum the count and rural flags to find the count of zip codes in wage data, and how many are rural..
wage.zips <- join %>%
distinct(Zip, `Rural Population`, `% Rural`, rural.flag, count)
num.wage.zips <- sum(wage.zips$count)
num.rural.zips <- sum(wage.zips$rural.flag)
Zip code totals:
## Total zip codes in wage data: 230
## Zip codes that are rural: 29
Grouping by location / place based analysis
Plot zip codes using tigris() and sf() package to visually identify clusters.
\(-\) \(-\)
library(sf)
library(tigris)
options(tigris_use_cache = TRUE)
states <- tigris::states() %>%
st_as_sf() %>%
select(STATEFP, NAME)
zip.points <- tigris::zctas() %>%
filter(ZCTA5CE20 %in% wage.zips$Zip) %>% st_centroid()
zip.plot <- wage.zips %>%
left_join(zip.points, by = c("Zip" = "ZCTA5CE20")) %>%
mutate(index.id = substr(Zip, 0, 1),
prefix = substr(Zip, 0, 3)) %>%
st_as_sf() %>%
st_join(states)
#st_write(zip.plot,"zip_plot2.shp")
tmap_mode("view")
tm_shape(zip.plot) +
tm_dots(col = c("index.id"), size = 0.7) +
tm_shape(zip.plot) +
tm_dots(col = c("rural.flag"), size = 0.2) +
tm_shape(states) +
tm_borders()